Re: logical replication worker accesses catalogs in error context callback

2021-07-06 Thread Tom Lane
Bharath Rupireddy  writes:
> How about making the below else if statement and the attname
> assignment into a single line? They are falling below the 80 char
> limit.
> else if (colno > 0 && colno <= list_length(rte->eref->colnames))
> attname = strVal(list_nth(rte->eref->colnames, colno - 1));

Pushed that way.  (I think possibly I'd had this code further indented
in its first incarnation, thus the extra line breaks.)

regards, tom lane




Re: logical replication worker accesses catalogs in error context callback

2021-07-03 Thread Bharath Rupireddy
On Sat, Jul 3, 2021 at 10:03 PM Tom Lane  wrote:
>
> Bharath Rupireddy  writes:
> > The patch basically looks good to me except a minor comment to have a
> > local variable for var->varattno which makes the code shorter.
>
> Here's a restructured version that uses rangetable data for the
> simple-relation case too.  I also modified the relevant test cases
> so that it's visible that we're reporting aliases not true names.

How about making the below else if statement and the attname
assignment into a single line? They are falling below the 80 char
limit.
else if (colno > 0 && colno <= list_length(rte->eref->colnames))
attname = strVal(list_nth(rte->eref->colnames, colno - 1));

Otherwise the v8 patch looks good to me.

Regards,
Bharath Rupireddy.




Re: logical replication worker accesses catalogs in error context callback

2021-07-03 Thread Tom Lane
Bharath Rupireddy  writes:
> The patch basically looks good to me except a minor comment to have a
> local variable for var->varattno which makes the code shorter.

Here's a restructured version that uses rangetable data for the
simple-relation case too.  I also modified the relevant test cases
so that it's visible that we're reporting aliases not true names.

regards, tom lane

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 31b5de91ad..25112df916 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -4096,15 +4096,17 @@ DROP TABLE reind_fdw_parent;
 -- conversion error
 -- ===
 ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 TYPE int;
-SELECT * FROM ft1 WHERE c1 = 1;  -- ERROR
+SELECT * FROM ft1 ftx(x1,x2,x3,x4,x5,x6,x7,x8) WHERE x1 = 1;  -- ERROR
 ERROR:  invalid input syntax for type integer: "foo"
-CONTEXT:  column "c8" of foreign table "ft1"
-SELECT  ft1.c1,  ft2.c2, ft1.c8 FROM ft1, ft2 WHERE ft1.c1 = ft2.c1 AND ft1.c1 = 1; -- ERROR
+CONTEXT:  column "x8" of foreign table "ftx"
+SELECT ftx.x1, ft2.c2, ftx.x8 FROM ft1 ftx(x1,x2,x3,x4,x5,x6,x7,x8), ft2
+  WHERE ftx.x1 = ft2.c1 AND ftx.x1 = 1; -- ERROR
 ERROR:  invalid input syntax for type integer: "foo"
-CONTEXT:  column "c8" of foreign table "ft1"
-SELECT  ft1.c1,  ft2.c2, ft1 FROM ft1, ft2 WHERE ft1.c1 = ft2.c1 AND ft1.c1 = 1; -- ERROR
+CONTEXT:  column "x8" of foreign table "ftx"
+SELECT ftx.x1, ft2.c2, ftx FROM ft1 ftx(x1,x2,x3,x4,x5,x6,x7,x8), ft2
+  WHERE ftx.x1 = ft2.c1 AND ftx.x1 = 1; -- ERROR
 ERROR:  invalid input syntax for type integer: "foo"
-CONTEXT:  whole-row reference to foreign table "ft1"
+CONTEXT:  whole-row reference to foreign table "ftx"
 SELECT sum(c2), array_agg(c8) FROM ft1 GROUP BY c8; -- ERROR
 ERROR:  invalid input syntax for type integer: "foo"
 CONTEXT:  processing expression at position 2 in select list
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index fafbab6b02..696277ba10 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -302,16 +302,8 @@ typedef struct
  */
 typedef struct ConversionLocation
 {
-	Relation	rel;			/* foreign table's relcache entry. */
 	AttrNumber	cur_attno;		/* attribute number being processed, or 0 */
-
-	/*
-	 * In case of foreign join push down, fdw_scan_tlist is used to identify
-	 * the Var node corresponding to the error location and
-	 * fsstate->ss.ps.state gives access to the RTEs of corresponding relation
-	 * to get the relation name and attribute name.
-	 */
-	ForeignScanState *fsstate;
+	ForeignScanState *fsstate;	/* plan node being processed */
 } ConversionLocation;
 
 /* Callback argument for ec_member_matches_foreign */
@@ -7082,7 +7074,6 @@ make_tuple_from_result_row(PGresult *res,
 	/*
 	 * Set up and install callback to report where conversion error occurs.
 	 */
-	errpos.rel = rel;
 	errpos.cur_attno = 0;
 	errpos.fsstate = fsstate;
 	errcallback.callback = conversion_error_callback;
@@ -7185,34 +7176,32 @@ make_tuple_from_result_row(PGresult *res,
 /*
  * Callback function which is called when error occurs during column value
  * conversion.  Print names of column and relation.
+ *
+ * Note that this function mustn't do any catalog lookups, since we are in
+ * an already-failed transaction.  Fortunately, we can get the needed info
+ * from the query's rangetable instead.
  */
 static void
 conversion_error_callback(void *arg)
 {
+	ConversionLocation *errpos = (ConversionLocation *) arg;
+	ForeignScanState *fsstate = errpos->fsstate;
+	ForeignScan *fsplan = castNode(ForeignScan, fsstate->ss.ps.plan);
+	int			varno = 0;
+	AttrNumber	colno = 0;
 	const char *attname = NULL;
 	const char *relname = NULL;
 	bool		is_wholerow = false;
-	ConversionLocation *errpos = (ConversionLocation *) arg;
 
-	if (errpos->rel)
+	if (fsplan->scan.scanrelid > 0)
 	{
 		/* error occurred in a scan against a foreign table */
-		TupleDesc	tupdesc = RelationGetDescr(errpos->rel);
-		Form_pg_attribute attr = TupleDescAttr(tupdesc, errpos->cur_attno - 1);
-
-		if (errpos->cur_attno > 0 && errpos->cur_attno <= tupdesc->natts)
-			attname = NameStr(attr->attname);
-		else if (errpos->cur_attno == SelfItemPointerAttributeNumber)
-			attname = "ctid";
-
-		relname = RelationGetRelationName(errpos->rel);
+		varno = fsplan->scan.scanrelid;
+		colno = errpos->cur_attno;
 	}
 	else
 	{
 		/* error occurred in a scan against a foreign join */
-		ForeignScanState *fsstate = errpos->fsstate;
-		ForeignScan *fsplan = castNode(ForeignScan, fsstate->ss.ps.plan);
-		EState	   *estate = fsstate->ss.ps.state;
 		TargetEntry *tle;
 
 		tle = list_nth_node(TargetEntry, fsplan->fdw_scan_tlist,
@@ -7220,35 +7209,42 @@ conversion_error_callback(void *arg)
 
 		/*
 		 * Target list can have Vars and expressions.  For Vars, we can get
-		

Re: logical replication worker accesses catalogs in error context callback

2021-07-03 Thread Tom Lane
Bharath Rupireddy  writes:
> Isn't it better to have the below comment before
> slot_store_error_callback, similar to what's there before
> conversion_error_callback in v7-0002.
>  * Note that this function mustn't do any catalog lookups, since we are in
>  * an already-failed transaction.

Not really, as there's not much temptation to do so in the current form
of that function.  I have no desire to go around and plaster every
errcontext callback with such comments.

> Maybe it's worth considering
> avoid_sys_cache_lookup_in_error_callback_v2.patch from [1] as another
> way to enforce the above statement.

That looks fundamentally broken from here.  Wouldn't it forbid
perfectly OK code like this randomly-chosen example from tablecmds.c?

if (list_member_oid(inheritOids, parentOid))
ereport(ERROR,
(errcode(ERRCODE_DUPLICATE_TABLE),
 errmsg("relation \"%s\" would be inherited from more than 
once",
get_rel_name(parentOid;

That is, it's a bit hard to say exactly where in the error processing
sequence we should start deeming it unsafe to do a catalog lookup;
but the mere presence of an error stack entry can't mean that.

In a lot of situations, there wouldn't be any need to consider the
transaction broken until we've done a longjmp, so that catalog
lookups in errcontext callbacks would be perfectly safe.  (Which
indeed is why we've not yet seen an actual problem in either of
the spots discussed in this thread.)  The reason for being paranoid
about what an errcontext callback can do is that the callback cannot
know what the current failure's cause is.  If we're trying to report
some internal error that means that the transaction really is pretty
broken, then it'd be problematic to initiate new catalog accesses.

regards, tom lane




Re: logical replication worker accesses catalogs in error context callback

2021-07-02 Thread Bharath Rupireddy
On Sat, Jul 3, 2021 at 2:19 AM Tom Lane  wrote:
>
> I wrote:
> > Didn't look at 0002 yet.
>
> ... and now that I have, I don't like it much.  It adds a lot of
> complexity, plus some lookup cycles that might be wasted.  I experimented
> with looking into the range table as I suggested previously, and
> that seems to work; see attached.  (This includes a little bit of
> code cleanup along with the bug fix proper.)
>
> An interesting point here is that the range table data will represent
> table and column aliases, not necessarily their true names.  I don't
> find that wrong, it's just different from what the code presently
> does.  If we go with this, likely we should change the plain-relation
> code path so that it also prints aliases from the RTE instead of
> the actual names.

Thanks. This patch is way simpler than what I proposed. Also, I like
the generic message "processing expression at position %d in select
list" when relname or attname is not available.

The patch basically looks good to me except a minor comment to have a
local variable for var->varattno which makes the code shorter.
if (IsA(tle->expr, Var))
{
-   RangeTblEntry *rte;
Var*var = (Var *) tle->expr;
+   AttrNumber  attno = var->varattno;
+   RangeTblEntry *rte = exec_rt_fetch(var->varno, estate);

-   rte = exec_rt_fetch(var->varno, estate);
+   relname = rte->eref->aliasname;

-   if (var->varattno == 0)
+   if (attno == 0)
is_wholerow = true;
-   else
-   attname = get_attname(rte->relid,
var->varattno, false);
-
-   relname = get_rel_name(rte->relid);
+   else if (attno > 0 && attno <=
list_length(rte->eref->colnames))
+   attname =
strVal(list_nth(rte->eref->colnames, attno - 1));
}

Regards,
Bharath Rupireddy.




Re: logical replication worker accesses catalogs in error context callback

2021-07-02 Thread Bharath Rupireddy
On Sat, Jul 3, 2021 at 1:37 AM Tom Lane  wrote:
>
> Bharath Rupireddy  writes:
> >> << Attaching v5-0001 here again for completion >>
> >> I'm attaching v5-0001 patch that avoids printing the column type names
> >> in the context message thus no cache lookups have to be done in the
> >> error context callback. I think the column name is enough to know on
> >> which column the error occurred and if required it's type can be known
> >> by the user. This patch gets rid of printing local and remote type
> >> names in slot_store_error_callback and also
> >> logicalrep_typmap_gettypname because it's unnecessary. Thoughts?
>
> I've now pushed this patch.  I noted that once we drop
> logicalrep_typmap_gettypname, there's really nothing using
> the LogicalRepTypMap table at all, so I nuked that too.
> (We can always recover the code from the git archives if
> some reason to use it emerges.)

Isn't it better to have the below comment before
slot_store_error_callback, similar to what's there before
conversion_error_callback in v7-0002.
 * Note that this function mustn't do any catalog lookups, since we are in
 * an already-failed transaction.

Maybe it's worth considering
avoid_sys_cache_lookup_in_error_callback_v2.patch from [1] as another
way to enforce the above statement.

[1] - 
https://www.postgresql.org/message-id/CAD21AoAwxbd-zXXUAeJ2FBRHr%2B%3DbfMUHoN7xJuXcwu1sFi1-sQ%40mail.gmail.com

Regards,
Bharath Rupireddy.




Re: logical replication worker accesses catalogs in error context callback

2021-07-02 Thread Tom Lane
I wrote:
> Didn't look at 0002 yet.

... and now that I have, I don't like it much.  It adds a lot of
complexity, plus some lookup cycles that might be wasted.  I experimented
with looking into the range table as I suggested previously, and
that seems to work; see attached.  (This includes a little bit of
code cleanup along with the bug fix proper.)

An interesting point here is that the range table data will represent
table and column aliases, not necessarily their true names.  I don't
find that wrong, it's just different from what the code presently
does.  If we go with this, likely we should change the plain-relation
code path so that it also prints aliases from the RTE instead of
the actual names.

regards, tom lane

diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index fafbab6b02..b40c331f8f 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -7185,6 +7185,10 @@ make_tuple_from_result_row(PGresult *res,
 /*
  * Callback function which is called when error occurs during column value
  * conversion.  Print names of column and relation.
+ *
+ * Note that this function mustn't do any catalog lookups, since we are in
+ * an already-failed transaction.  Fortunately, we can get info from the
+ * relcache entry (for a simple scan) or the query rangetable (for joins).
  */
 static void
 conversion_error_callback(void *arg)
@@ -7198,10 +7202,14 @@ conversion_error_callback(void *arg)
 	{
 		/* error occurred in a scan against a foreign table */
 		TupleDesc	tupdesc = RelationGetDescr(errpos->rel);
-		Form_pg_attribute attr = TupleDescAttr(tupdesc, errpos->cur_attno - 1);
 
 		if (errpos->cur_attno > 0 && errpos->cur_attno <= tupdesc->natts)
+		{
+			Form_pg_attribute attr = TupleDescAttr(tupdesc,
+   errpos->cur_attno - 1);
+
 			attname = NameStr(attr->attname);
+		}
 		else if (errpos->cur_attno == SelfItemPointerAttributeNumber)
 			attname = "ctid";
 
@@ -7220,35 +7228,32 @@ conversion_error_callback(void *arg)
 
 		/*
 		 * Target list can have Vars and expressions.  For Vars, we can get
-		 * its relation, however for expressions we can't.  Thus for
+		 * some information, however for expressions we can't.  Thus for
 		 * expressions, just show generic context message.
 		 */
 		if (IsA(tle->expr, Var))
 		{
-			RangeTblEntry *rte;
 			Var		   *var = (Var *) tle->expr;
+			RangeTblEntry *rte = exec_rt_fetch(var->varno, estate);
 
-			rte = exec_rt_fetch(var->varno, estate);
+			relname = rte->eref->aliasname;
 
 			if (var->varattno == 0)
 is_wholerow = true;
-			else
-attname = get_attname(rte->relid, var->varattno, false);
-
-			relname = get_rel_name(rte->relid);
+			else if (var->varattno > 0 &&
+	 var->varattno <= list_length(rte->eref->colnames))
+attname = strVal(list_nth(rte->eref->colnames,
+		  var->varattno - 1));
 		}
-		else
-			errcontext("processing expression at position %d in select list",
-	   errpos->cur_attno);
 	}
 
-	if (relname)
-	{
-		if (is_wholerow)
-			errcontext("whole-row reference to foreign table \"%s\"", relname);
-		else if (attname)
-			errcontext("column \"%s\" of foreign table \"%s\"", attname, relname);
-	}
+	if (relname && is_wholerow)
+		errcontext("whole-row reference to foreign table \"%s\"", relname);
+	else if (relname && attname)
+		errcontext("column \"%s\" of foreign table \"%s\"", attname, relname);
+	else
+		errcontext("processing expression at position %d in select list",
+   errpos->cur_attno);
 }
 
 /*


Re: logical replication worker accesses catalogs in error context callback

2021-07-02 Thread Tom Lane
Bharath Rupireddy  writes:
>> << Attaching v5-0001 here again for completion >>
>> I'm attaching v5-0001 patch that avoids printing the column type names
>> in the context message thus no cache lookups have to be done in the
>> error context callback. I think the column name is enough to know on
>> which column the error occurred and if required it's type can be known
>> by the user. This patch gets rid of printing local and remote type
>> names in slot_store_error_callback and also
>> logicalrep_typmap_gettypname because it's unnecessary. Thoughts?

I've now pushed this patch.  I noted that once we drop
logicalrep_typmap_gettypname, there's really nothing using
the LogicalRepTypMap table at all, so I nuked that too.
(We can always recover the code from the git archives if
some reason to use it emerges.)

Didn't look at 0002 yet.

regards, tom lane




Re: logical replication worker accesses catalogs in error context callback

2021-05-17 Thread Bharath Rupireddy
On Fri, Apr 16, 2021 at 3:23 PM Bharath Rupireddy <
bharath.rupireddyforpostg...@gmail.com> wrote:
>
> On Tue, Mar 16, 2021 at 2:21 AM Tom Lane  wrote:
> >
> > Bharath Rupireddy  writes:
> > > Thanks for pointing to the changes in the commit message. I corrected
> > > them. Attaching v4 patch set, consider it for further review.
> >
> > I took a quick look at this.  I'm quite worried about the potential
> > performance cost of the v4-0001 patch (the one for fixing
> > slot_store_error_callback).  Previously, we didn't pay any noticeable
> > cost for having the callback unless there actually was an error.
> > As patched, we perform several catalog lookups per column per row,
> > even in the non-error code path.  That seems like it'd be a noticeable
> > performance hit.  Just to add insult to injury, it leaks memory.
> >
> > I propose a more radical but simpler solution: let's just not bother
> > with including the type names in the context message.  How much are
> > they really buying?
>
> << Attaching v5-0001 here again for completion >>
> I'm attaching v5-0001 patch that avoids printing the column type names
> in the context message thus no cache lookups have to be done in the
> error context callback. I think the column name is enough to know on
> which column the error occurred and if required it's type can be known
> by the user. This patch gets rid of printing local and remote type
> names in slot_store_error_callback and also
> logicalrep_typmap_gettypname because it's unnecessary. Thoughts?
>
> > v4-0002 (for postgres_fdw's conversion_error_callback) has the same
> > problems, although mitigated a bit by not needing to do any catalog
> > lookups in the non-join case.  For the join case, I wonder whether
> > we could push the lookups back to plan setup time, so they don't
> > need to be done over again for each row.  (Not doing lookups at all
> > doesn't seem attractive there; it'd render the context message nearly
> > useless.)  A different idea is to try to get the column names out
> > of the query's rangetable instead of going to the catalogs.
>
> I'm attaching v5-0002 which stores the required attribute information
> for foreign joins in postgresBeginForeignScan which is a one time job
> as opposed to the per-row system catalog lookup v4-0001 was doing. I'm
> storing the foreign table relid(as key), relname and the retrieved
> attributes' attnum and attname into a hash table. Whenever a
> conversion error occurs, using relid, the hash table is looked up to
> fetch the relname and attname. Thoughts?

Attaching rebased v6 patches.

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
From a65e13d591bb151eb71eb8ec42834ce28a3db304 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Wed, 14 Apr 2021 13:25:16 +0530
Subject: [PATCH v6] Avoid Catalogue Accesses In slot_store_error_callback

Avoid accessing system catalogues inside slot_store_error_callback
error context callback, because the the entire transaction might
have been broken at this point. logicalrep_typmap_gettypname()
and format_type_be could search the sys cache.

As per suggestion from Tom, a simple solution is to just avoid
printing the column type names in the message, just the column
name is enough to know on which column the error occurred.

The above solution makes logicalrep_typmap_gettypname redundant
hence removed it.
---
 src/backend/replication/logical/relation.c | 49 --
 src/backend/replication/logical/worker.c   | 25 +--
 src/include/replication/logicalrelation.h  |  2 -
 3 files changed, 2 insertions(+), 74 deletions(-)

diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c
index e861c0ff80..cbc5612dbe 100644
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -490,55 +490,6 @@ logicalrep_typmap_update(LogicalRepTyp *remotetyp)
 	MemoryContextSwitchTo(oldctx);
 }
 
-/*
- * Fetch type name from the cache by remote type OID.
- *
- * Return a substitute value if we cannot find the data type; no message is
- * sent to the log in that case, because this is used by error callback
- * already.
- */
-char *
-logicalrep_typmap_gettypname(Oid remoteid)
-{
-	LogicalRepTyp *entry;
-	bool		found;
-
-	/* Internal types are mapped directly. */
-	if (remoteid < FirstGenbkiObjectId)
-	{
-		if (!get_typisdefined(remoteid))
-		{
-			/*
-			 * This can be caused by having a publisher with a higher
-			 * PostgreSQL major version than the subscriber.
-			 */
-			return psprintf("unrecognized %u", remoteid);
-		}
-
-		return format_type_be(remoteid);
-	}
-
-	if (LogicalRepTypMap == NULL)
-	{
-		/*
-		 * If the typemap is not initialized yet, we cannot possibly attempt
-		 * to search the hash table; but there's no way we know the type
-		 * locally yet, since we haven't received a message about this type,
-		 * so this is the best we can do.
-		 */
-		return psprintf("unrecognized %u", remoteid);
-	

Re: logical replication worker accesses catalogs in error context callback

2021-04-16 Thread Bharath Rupireddy
On Tue, Mar 16, 2021 at 2:21 AM Tom Lane  wrote:
>
> Bharath Rupireddy  writes:
> > Thanks for pointing to the changes in the commit message. I corrected
> > them. Attaching v4 patch set, consider it for further review.
>
> I took a quick look at this.  I'm quite worried about the potential
> performance cost of the v4-0001 patch (the one for fixing
> slot_store_error_callback).  Previously, we didn't pay any noticeable
> cost for having the callback unless there actually was an error.
> As patched, we perform several catalog lookups per column per row,
> even in the non-error code path.  That seems like it'd be a noticeable
> performance hit.  Just to add insult to injury, it leaks memory.
>
> I propose a more radical but simpler solution: let's just not bother
> with including the type names in the context message.  How much are
> they really buying?

<< Attaching v5-0001 here again for completion >>
I'm attaching v5-0001 patch that avoids printing the column type names
in the context message thus no cache lookups have to be done in the
error context callback. I think the column name is enough to know on
which column the error occurred and if required it's type can be known
by the user. This patch gets rid of printing local and remote type
names in slot_store_error_callback and also
logicalrep_typmap_gettypname because it's unnecessary. Thoughts?

> v4-0002 (for postgres_fdw's conversion_error_callback) has the same
> problems, although mitigated a bit by not needing to do any catalog
> lookups in the non-join case.  For the join case, I wonder whether
> we could push the lookups back to plan setup time, so they don't
> need to be done over again for each row.  (Not doing lookups at all
> doesn't seem attractive there; it'd render the context message nearly
> useless.)  A different idea is to try to get the column names out
> of the query's rangetable instead of going to the catalogs.

I'm attaching v5-0002 which stores the required attribute information
for foreign joins in postgresBeginForeignScan which is a one time job
as opposed to the per-row system catalog lookup v4-0001 was doing. I'm
storing the foreign table relid(as key), relname and the retrieved
attributes' attnum and attname into a hash table. Whenever a
conversion error occurs, using relid, the hash table is looked up to
fetch the relname and attname. Thoughts?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com


v5-0001-Avoid-Catalogue-Accesses-In-slot_store_error_call.patch
Description: Binary data


v5-0002-Avoid-Catalogue-Accesses-In-conversion_error_call.patch
Description: Binary data


Re: logical replication worker accesses catalogs in error context callback

2021-04-14 Thread Bharath Rupireddy
On Wed, Mar 17, 2021 at 4:52 PM Bharath Rupireddy
 wrote:
>
> On Tue, Mar 16, 2021 at 2:21 AM Tom Lane  wrote:
> > > Thanks for pointing to the changes in the commit message. I corrected
> > > them. Attaching v4 patch set, consider it for further review.
> >
> > I took a quick look at this.  I'm quite worried about the potential
> > performance cost of the v4-0001 patch (the one for fixing
> > slot_store_error_callback).  Previously, we didn't pay any noticeable
> > cost for having the callback unless there actually was an error.
> > As patched, we perform several catalog lookups per column per row,
> > even in the non-error code path.  That seems like it'd be a noticeable
> > performance hit.  Just to add insult to injury, it leaks memory.
> >
> > I propose a more radical but simpler solution: let's just not bother
> > with including the type names in the context message.  How much are
> > they really buying?
>
> Thanks. In that case, the message can only return the local and remote
> columns names and ignore the types (something like below). And the
> user will have to figure out what are the types of those columns in
> local and remote separately in case of error. Then the function
> logicalrep_typmap_gettypname can also be removed. I'm not sure if this
> is okay. Thoughts?

Hi Tom,

As suggested earlier, I'm attaching a v5 patch that avoids printing
the column type names in the context message thus no cache lookups
have to be done in the error context callback. I think the column name
is enough to know on which column the error occurred and if required
it's type can be known by the user. This patch gets rid of printing
local and remote type names in slot_store_error_callback and also
logicalrep_typmap_gettypname because it's unnecessary. I'm not sure if
this solution is acceptable. Thoughts?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
From a65e13d591bb151eb71eb8ec42834ce28a3db304 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Wed, 14 Apr 2021 13:25:16 +0530
Subject: [PATCH v5] Avoid Catalogue Accesses In slot_store_error_callback

Avoid accessing system catalogues inside slot_store_error_callback
error context callback, because the the entire transaction might
have been broken at this point. logicalrep_typmap_gettypname()
and format_type_be could search the sys cache.

As per suggestion from Tom, a simple solution is to just avoid
printing the column type names in the message, just the column
name is enough to know on which column the error occurred.

The above solution makes logicalrep_typmap_gettypname redundant
hence removed it.
---
 src/backend/replication/logical/relation.c | 49 --
 src/backend/replication/logical/worker.c   | 25 +--
 src/include/replication/logicalrelation.h  |  2 -
 3 files changed, 2 insertions(+), 74 deletions(-)

diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c
index e861c0ff80..cbc5612dbe 100644
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -490,55 +490,6 @@ logicalrep_typmap_update(LogicalRepTyp *remotetyp)
 	MemoryContextSwitchTo(oldctx);
 }
 
-/*
- * Fetch type name from the cache by remote type OID.
- *
- * Return a substitute value if we cannot find the data type; no message is
- * sent to the log in that case, because this is used by error callback
- * already.
- */
-char *
-logicalrep_typmap_gettypname(Oid remoteid)
-{
-	LogicalRepTyp *entry;
-	bool		found;
-
-	/* Internal types are mapped directly. */
-	if (remoteid < FirstGenbkiObjectId)
-	{
-		if (!get_typisdefined(remoteid))
-		{
-			/*
-			 * This can be caused by having a publisher with a higher
-			 * PostgreSQL major version than the subscriber.
-			 */
-			return psprintf("unrecognized %u", remoteid);
-		}
-
-		return format_type_be(remoteid);
-	}
-
-	if (LogicalRepTypMap == NULL)
-	{
-		/*
-		 * If the typemap is not initialized yet, we cannot possibly attempt
-		 * to search the hash table; but there's no way we know the type
-		 * locally yet, since we haven't received a message about this type,
-		 * so this is the best we can do.
-		 */
-		return psprintf("unrecognized %u", remoteid);
-	}
-
-	/* search the mapping */
-	entry = hash_search(LogicalRepTypMap, (void *) ,
-		HASH_FIND, );
-	if (!found)
-		return psprintf("unrecognized %u", remoteid);
-
-	Assert(OidIsValid(entry->remoteid));
-	return psprintf("%s.%s", entry->nspname, entry->typname);
-}
-
 /*
  * Partition cache: look up partition LogicalRepRelMapEntry's
  *
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index fb3ba5c415..6494bef1d9 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -132,7 +132,6 @@ static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping);
 typedef struct SlotErrCallbackArg
 {
 	LogicalRepRelMapEntry *rel;
-	int			local_attnum;
 	int			

Re: logical replication worker accesses catalogs in error context callback

2021-03-17 Thread Bharath Rupireddy
On Tue, Mar 16, 2021 at 2:21 AM Tom Lane  wrote:
> > Thanks for pointing to the changes in the commit message. I corrected
> > them. Attaching v4 patch set, consider it for further review.
>
> I took a quick look at this.  I'm quite worried about the potential
> performance cost of the v4-0001 patch (the one for fixing
> slot_store_error_callback).  Previously, we didn't pay any noticeable
> cost for having the callback unless there actually was an error.
> As patched, we perform several catalog lookups per column per row,
> even in the non-error code path.  That seems like it'd be a noticeable
> performance hit.  Just to add insult to injury, it leaks memory.
>
> I propose a more radical but simpler solution: let's just not bother
> with including the type names in the context message.  How much are
> they really buying?

Thanks. In that case, the message can only return the local and remote
columns names and ignore the types (something like below). And the
user will have to figure out what are the types of those columns in
local and remote separately in case of error. Then the function
logicalrep_typmap_gettypname can also be removed. I'm not sure if this
is okay. Thoughts?

TupleDesc localtupdesc = RelationGetDescr(rel->localrel);
Form_pg_attribute localatt = TupleDescAttr(localtupdesc,
errarg->local_attnum - 1);

errcontext("processing remote data for replication target relation
\"%s.%s\" column \"%s\", "
   "remote column %s, local column %s",
   rel->remoterel.nspname, rel->remoterel.relname,
   rel->remoterel.attnames[errarg->remote_attnum],
  NameStr(localatt->attname)));

> v4-0002 (for postgres_fdw's conversion_error_callback) has the same
> problems, although mitigated a bit by not needing to do any catalog
> lookups in the non-join case.  For the join case, I wonder whether
> we could push the lookups back to plan setup time, so they don't
> need to be done over again for each row.  (Not doing lookups at all
> doesn't seem attractive there; it'd render the context message nearly
> useless.)  A different idea is to try to get the column names out
> of the query's rangetable instead of going to the catalogs.

I will think more on this point.

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-03-15 Thread Tom Lane
Bharath Rupireddy  writes:
> Thanks for pointing to the changes in the commit message. I corrected
> them. Attaching v4 patch set, consider it for further review.

I took a quick look at this.  I'm quite worried about the potential
performance cost of the v4-0001 patch (the one for fixing
slot_store_error_callback).  Previously, we didn't pay any noticeable
cost for having the callback unless there actually was an error.
As patched, we perform several catalog lookups per column per row,
even in the non-error code path.  That seems like it'd be a noticeable
performance hit.  Just to add insult to injury, it leaks memory.

I propose a more radical but simpler solution: let's just not bother
with including the type names in the context message.  How much are
they really buying?

v4-0002 (for postgres_fdw's conversion_error_callback) has the same
problems, although mitigated a bit by not needing to do any catalog
lookups in the non-join case.  For the join case, I wonder whether
we could push the lookups back to plan setup time, so they don't
need to be done over again for each row.  (Not doing lookups at all
doesn't seem attractive there; it'd render the context message nearly
useless.)  A different idea is to try to get the column names out
of the query's rangetable instead of going to the catalogs.

regards, tom lane




Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Bharath Rupireddy
On Thu, Feb 4, 2021 at 5:42 PM Amit Kapila  wrote:
> > Thanks Amit. I verified it with gdb. I attached gdb to the logical
> > replication worker. In slot_store_data's for loop, I intentionally set
> > CurrentTransactionState->state = TRANS_DEFAULT,
> >
>
> What happens if you don't change CurrentTransactionState->state?

Just a refresher -  the problem we are trying to solve with the 2
patches proposed in this thread [1] is that the error callbacks
shouldn't be accessing system catalogues because the txn might be
broken by then or another error can be raised in system catalogue
access paths (if we have a cache miss and access the catalogue table
from the disk). So the required information that's needed in the error
callback is to be filled in the function that register the callback.

Now, coming to testing the patches: I tried to use the error callback
when we are outside of the xact. For 0001patch i.e.
slot_store_error_callback, I called it after the
apply_handle_commit_internal in apply_handle_commit. The expectation
is that the patches proposed in this thread [1], should just be usable
even outside of the xact.

HEAD: The logical replication worker crashes at
Assert(IsTransactionState() in SearchCatCacheInternal. See [2] for the
testing patch that's used.

Patched: The testing DEBUG message gets printed in the subscriber
server log and no crash happens. See [3] for the testing patch that's
used.
2021-02-05 12:02:32.340 IST [2361724] DEBUG:  calling slot_store_error_callback
2021-02-05 12:02:32.340 IST [2361724] CONTEXT:  processing remote data
for replication target relation "public.t1" column "a1", remote type
remote_test_type, local type local_test_type

Similarly, the 0002 patch in [1] which is avoiding system catalogues
in conversion_error_callback postgres_fdw can also be tested.

Hope this testing is enough for the patches. Please let me know if
anything more is required.

[1] - 
https://www.postgresql.org/message-id/CALj2ACUkvABzq6yeKQZsgng5Rd_NF%2BikhTvL9k4NR0GSRKsSdg%40mail.gmail.com
[2] -
@@ -713,6 +717,26 @@ apply_handle_commit(StringInfo s)
  apply_handle_commit_internal(s, _data);
 +if (rel_for_err_cb_chk)
+{
+/* We are out of xact. */
+Assert(!IsTransactionState());
+
+/* Push callback + info on the error context stack */
+errarg.rel = rel_for_err_cb_chk;
+errarg.local_attnum = 0;
+errarg.remote_attnum = 0;
+errcallback.callback = slot_store_error_callback;
+errcallback.arg = (void *) 
+errcallback.previous = error_context_stack;
+error_context_stack = 
+
+elog(DEBUG1, "calling slot_store_error_callback");
+
+/* Pop the error context stack */
+error_context_stack = errcallback.previous;
+rel_for_err_cb_chk = NULL;
+}
[3]
 @@ -719,6 +724,26 @@ apply_handle_commit(StringInfo s)
  apply_handle_commit_internal(s, _data);
 +if (rel_for_err_cb_chk)
+{
+/* We are out of xact. */
+Assert(!IsTransactionState());
+errarg.rel = rel_for_err_cb_chk;
+errarg.remote_attnum = 0;
+errarg.local_typename = "local_test_type";
+errarg.remote_typename = "remote_test_type";
+errcallback.callback = slot_store_error_callback;
+errcallback.arg = (void *) 
+errcallback.previous = error_context_stack;
+error_context_stack = 
+
+elog(DEBUG1, "calling slot_store_error_callback");
+
+/* Pop the error context stack */
+error_context_stack = errcallback.previous;
+rel_for_err_cb_chk = NULL;
+}

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Andres Freund
Hi,

On 2021-02-04 15:08:42 +0530, Bharath Rupireddy wrote:
> On Thu, Feb 4, 2021 at 5:16 AM Andres Freund  wrote:
> > > > > About 0001, have we tried to reproduce the actual bug here which means
> > > > > when the error_callback is called we should face some problem? I feel
> > > > > with the correct testcase we should hit the Assert
> > > > > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > > > > there we expect the transaction to be in a valid state. I understand
> > > > > that the transaction is in a broken state at that time but having a
> > > > > testcase to hit the actual bug makes it easy to test the fix.
> > > >
> > > > I think it's easy to hit - add a trivial DEBUG message to XLogWrite(),
> > > > and run anything involving logical rep using a low enough log
> > > > level. There might even be enough messages with a low enough level
> > > > already somewhere in the relevant paths.
> > >
> > > I'm not sure how adding a DEBUG message to XLogWrite() would ensure
> > > the logical replication worker on the subscriber system enters
> > > slot_store_error_callback and hits the Assert(IsTransactionState());?
> > > Could you please elaborate? Or I may be missing something here to
> > > understand.
> >
> > XLogWrite() is in a critical section, the DEBUG messages triggers error
> > context callbacks to be called, the callbacks allocate memory, which
> > triggers an assertion.
>
> I see that XLogWrite() can be called from logical replication
> worker(apply_handle_commit->apply_handle_commit_internal->CommitTransactionCommand->CommitTransaction->RecordTransactionCommit->XLogFlush->XLogWrite
> --> by now the txn state is not TRANS_INPROGRESS, so
> IsTransactionState() can return false. Adding a DEBUG message there
> can call the error context callback.
>
> But the error context callback slot_store_error_callback, gets used in
> slot_store_data and slot_modify_data. In both places we just register
> the callback before an attribute processing for loop and deregister
> after it. So, when we are in those for loops, we expect to be in
> TRANS_INPROGRESS, see ensure_transaction before slot_store_data and
> the same is true for slot_modify_data. So, to really hit the assert
> failure in the catalogue access from slot_store_error_callback, first,
> we shouldn't be in TRANS_INPROGRESS in those for loops and have a
> DEBUG message. I'm not exactly sure how we achieve this.

I think you're focussing too much on IsTransactionState(). It's easier
to hit the critical section issues.

What makes the issue a bit hard to hit is that to hit it you need a
syscache miss while slot_store_error_callback is set up, and that cache
miss then needs to trigger a read from disk (to potentially trigger a
WAL flush), and then the slot_store_error_callback() needs to actually
do something.

With the below patch

$ cat /tmp/tmp_config.conf
log_min_messages=debug5
shared_buffers=1MB

make TEMP_CONFIG=/tmp/extra_config.conf -C src/test/subscription/ check

triggers an assertion most of the time. The catcache.c changes aren't
necessary for this test, but we should have them...

Regards,

Andres

diff --git c/src/backend/access/transam/xlog.c 
i/src/backend/access/transam/xlog.c
index f03bd473e2b..7f422b4f5f2 100644
--- c/src/backend/access/transam/xlog.c
+++ i/src/backend/access/transam/xlog.c
@@ -2432,6 +2432,8 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 int startidx;
 uint32  startoffset;
 
+elog(DEBUG1, "writing!");
+
 /* We should always be inside a critical section here */
 Assert(CritSectionCount > 0);
 
@@ -2885,6 +2887,8 @@ XLogFlush(XLogRecPtr record)
 
 START_CRIT_SECTION();
 
+elog(DEBUG1, "checking for flushing");
+
 /*
  * Since fsync is usually a horribly expensive operation, we try to
  * piggyback as much data as we can on each fsync: if we see any more data
diff --git c/src/backend/replication/logical/worker.c 
i/src/backend/replication/logical/worker.c
index eb7db89cef7..19aa76af889 100644
--- c/src/backend/replication/logical/worker.c
+++ i/src/backend/replication/logical/worker.c
@@ -435,6 +435,8 @@ slot_store_error_callback(void *arg)
 Oid remotetypoid,
 localtypoid;
 
+Assert(IsTransactionState() && CritSectionCount == 0);
+
 /* Nothing to do if remote attribute number is not set */
 if (errarg->remote_attnum < 0)
 return;
diff --git c/src/backend/utils/cache/catcache.c 
i/src/backend/utils/cache/catcache.c
index fa2b49c676e..c7335c08570 100644
--- c/src/backend/utils/cache/catcache.c
+++ i/src/backend/utils/cache/catcache.c
@@ -1210,7 +1210,7 @@ SearchCatCacheInternal(CatCache *cache,
 CatCTup*ct;
 
 /* Make sure we're in an xact, even if this ends up being a cache hit */
-Assert(IsTransactionState());
+Assert(IsTransactionState() && CritSectionCount == 0);
 
 Assert(cache->cc_nkeys == nkeys);
 
@@ -1523,6 +1523,9 @@ SearchCatCacheList(CatCache *cache,
 MemoryContext 

Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Amit Kapila
On Thu, Feb 4, 2021 at 5:38 PM Bharath Rupireddy
 wrote:
>
> On Thu, Feb 4, 2021 at 4:00 PM Amit Kapila  wrote:
> > > > About 0001, have we tried to reproduce the actual bug here which means
> > > > when the error_callback is called we should face some problem? I feel
> > > > with the correct testcase we should hit the Assert
> > > > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > > > there we expect the transaction to be in a valid state. I understand
> > > > that the transaction is in a broken state at that time but having a
> > > > testcase to hit the actual bug makes it easy to test the fix.
> > >
> > > I have not tried hitting the Assert(IsTransactionState() in
> > > SearchCatCacheInternal. To do that, I need to figure out hitting
> > > "incorrect binary data format in logical replication column" error in
> > > either slot_modify_data or slot_store_data so that we will enter the
> > > error callback slot_store_error_callback and then IsTransactionState()
> > > should return false i.e. txn shouldn't be in TRANS_INPROGRESS.
> > >
> >
> > Even, if you hit that via debugger it will be sufficient or you can
> > write another elog/ereport there to achieve the same. The exact test
> > case to hit that error is not mandatory.
>
> Thanks Amit. I verified it with gdb. I attached gdb to the logical
> replication worker. In slot_store_data's for loop, I intentionally set
> CurrentTransactionState->state = TRANS_DEFAULT,
>

What happens if you don't change CurrentTransactionState->state?

-- 
With Regards,
Amit Kapila.




Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Bharath Rupireddy
On Thu, Feb 4, 2021 at 4:00 PM Amit Kapila  wrote:
> > > About 0001, have we tried to reproduce the actual bug here which means
> > > when the error_callback is called we should face some problem? I feel
> > > with the correct testcase we should hit the Assert
> > > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > > there we expect the transaction to be in a valid state. I understand
> > > that the transaction is in a broken state at that time but having a
> > > testcase to hit the actual bug makes it easy to test the fix.
> >
> > I have not tried hitting the Assert(IsTransactionState() in
> > SearchCatCacheInternal. To do that, I need to figure out hitting
> > "incorrect binary data format in logical replication column" error in
> > either slot_modify_data or slot_store_data so that we will enter the
> > error callback slot_store_error_callback and then IsTransactionState()
> > should return false i.e. txn shouldn't be in TRANS_INPROGRESS.
> >
>
> Even, if you hit that via debugger it will be sufficient or you can
> write another elog/ereport there to achieve the same. The exact test
> case to hit that error is not mandatory.

Thanks Amit. I verified it with gdb. I attached gdb to the logical
replication worker. In slot_store_data's for loop, I intentionally set
CurrentTransactionState->state = TRANS_DEFAULT, and jumped to the
existing error "incorrect binary data format in logical replication
column", so that the slot_store_error_callback is called. While we are
in the error context callback:

On master: since the system catalogues are accessed in
slot_store_error_callback, the Assert(IsTransactionState() in
SearchCatCacheInternal failed and the error we intend to see is not
logged and we see below in the subscriber server log and the session
in the subscriber gets restarted.
2021-02-04 17:26:27.517 IST [2269230] ERROR:  could not send data to
WAL stream: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
2021-02-04 17:26:27.518 IST [2269190] LOG:  background worker "logical
replication worker" (PID 2269230) exited with exit code 1

With patch: since we avoided system catalogue access in
slot_store_error_callback, we see the error that we intentionally
jumped to, in the subscriber server log.
2021-02-04 17:27:37.542 IST [2269424] ERROR:  incorrect binary data
format in logical replication column 1
2021-02-04 17:27:37.542 IST [2269424] CONTEXT:  processing remote data
for replication target relation "public.t1" column "a1", remote type
integer, local type integer

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Amit Kapila
On Wed, Feb 3, 2021 at 4:31 PM Bharath Rupireddy
 wrote:
>
> On Thu, Jan 28, 2021 at 11:14 AM Amit Kapila  wrote:
> >
> > On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
> >  wrote:
> > >
> > > On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> > >
> > > Thanks for pointing to the changes in the commit message. I corrected
> > > them. Attaching v4 patch set, consider it for further review.
> > >
> >
> > About 0001, have we tried to reproduce the actual bug here which means
> > when the error_callback is called we should face some problem? I feel
> > with the correct testcase we should hit the Assert
> > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > there we expect the transaction to be in a valid state. I understand
> > that the transaction is in a broken state at that time but having a
> > testcase to hit the actual bug makes it easy to test the fix.
>
> I have not tried hitting the Assert(IsTransactionState() in
> SearchCatCacheInternal. To do that, I need to figure out hitting
> "incorrect binary data format in logical replication column" error in
> either slot_modify_data or slot_store_data so that we will enter the
> error callback slot_store_error_callback and then IsTransactionState()
> should return false i.e. txn shouldn't be in TRANS_INPROGRESS.
>

Even, if you hit that via debugger it will be sufficient or you can
write another elog/ereport there to achieve the same. The exact test
case to hit that error is not mandatory.

-- 
With Regards,
Amit Kapila.




Re: logical replication worker accesses catalogs in error context callback

2021-02-04 Thread Bharath Rupireddy
On Thu, Feb 4, 2021 at 5:16 AM Andres Freund  wrote:
> > > > About 0001, have we tried to reproduce the actual bug here which means
> > > > when the error_callback is called we should face some problem? I feel
> > > > with the correct testcase we should hit the Assert
> > > > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > > > there we expect the transaction to be in a valid state. I understand
> > > > that the transaction is in a broken state at that time but having a
> > > > testcase to hit the actual bug makes it easy to test the fix.
> > >
> > > I think it's easy to hit - add a trivial DEBUG message to XLogWrite(),
> > > and run anything involving logical rep using a low enough log
> > > level. There might even be enough messages with a low enough level
> > > already somewhere in the relevant paths.
> >
> > I'm not sure how adding a DEBUG message to XLogWrite() would ensure
> > the logical replication worker on the subscriber system enters
> > slot_store_error_callback and hits the Assert(IsTransactionState());?
> > Could you please elaborate? Or I may be missing something here to
> > understand.
>
> XLogWrite() is in a critical section, the DEBUG messages triggers error
> context callbacks to be called, the callbacks allocate memory, which
> triggers an assertion.

I see that XLogWrite() can be called from logical replication
worker(apply_handle_commit->apply_handle_commit_internal->CommitTransactionCommand->CommitTransaction->RecordTransactionCommit->XLogFlush->XLogWrite
--> by now the txn state is not TRANS_INPROGRESS, so
IsTransactionState() can return false. Adding a DEBUG message there
can call the error context callback.

But the error context callback slot_store_error_callback, gets used in
slot_store_data and slot_modify_data. In both places we just register
the callback before an attribute processing for loop and deregister
after it. So, when we are in those for loops, we expect to be in
TRANS_INPROGRESS, see ensure_transaction before slot_store_data and
the same is true for slot_modify_data. So, to really hit the assert
failure in the catalogue access from slot_store_error_callback, first,
we shouldn't be in TRANS_INPROGRESS in those for loops and have a
DEBUG message. I'm not exactly sure how we achieve this.

Am I missing something?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-02-03 Thread Andres Freund
Hi,

On 2021-02-03 16:35:29 +0530, Bharath Rupireddy wrote:
> On Sat, Jan 30, 2021 at 8:23 AM Andres Freund  wrote:
> > On 2021-01-28 11:14:03 +0530, Amit Kapila wrote:
> > > On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
> > >  wrote:
> > > >
> > > > On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> > > >
> > > > Thanks for pointing to the changes in the commit message. I corrected
> > > > them. Attaching v4 patch set, consider it for further review.
> > > >
> > >
> > > About 0001, have we tried to reproduce the actual bug here which means
> > > when the error_callback is called we should face some problem? I feel
> > > with the correct testcase we should hit the Assert
> > > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > > there we expect the transaction to be in a valid state. I understand
> > > that the transaction is in a broken state at that time but having a
> > > testcase to hit the actual bug makes it easy to test the fix.
> >
> > I think it's easy to hit - add a trivial DEBUG message to XLogWrite(),
> > and run anything involving logical rep using a low enough log
> > level. There might even be enough messages with a low enough level
> > already somewhere in the relevant paths.
> 
> I'm not sure how adding a DEBUG message to XLogWrite() would ensure
> the logical replication worker on the subscriber system enters
> slot_store_error_callback and hits the Assert(IsTransactionState());?
> Could you please elaborate? Or I may be missing something here to
> understand.

XLogWrite() is in a critical section, the DEBUG messages triggers error
context callbacks to be called, the callbacks allocate memory, which
triggers an assertion.

Greetings,

Andres Freund




Re: logical replication worker accesses catalogs in error context callback

2021-02-03 Thread Bharath Rupireddy
On Sat, Jan 30, 2021 at 8:23 AM Andres Freund  wrote:
>
> Hi,
>
> On 2021-01-28 11:14:03 +0530, Amit Kapila wrote:
> > On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
> >  wrote:
> > >
> > > On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> > >
> > > Thanks for pointing to the changes in the commit message. I corrected
> > > them. Attaching v4 patch set, consider it for further review.
> > >
> >
> > About 0001, have we tried to reproduce the actual bug here which means
> > when the error_callback is called we should face some problem? I feel
> > with the correct testcase we should hit the Assert
> > (Assert(IsTransactionState());) in SearchCatCacheInternal because
> > there we expect the transaction to be in a valid state. I understand
> > that the transaction is in a broken state at that time but having a
> > testcase to hit the actual bug makes it easy to test the fix.
>
> I think it's easy to hit - add a trivial DEBUG message to XLogWrite(),
> and run anything involving logical rep using a low enough log
> level. There might even be enough messages with a low enough level
> already somewhere in the relevant paths.

I'm not sure how adding a DEBUG message to XLogWrite() would ensure
the logical replication worker on the subscriber system enters
slot_store_error_callback and hits the Assert(IsTransactionState());?
Could you please elaborate? Or I may be missing something here to
understand.

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-02-03 Thread Bharath Rupireddy
On Thu, Jan 28, 2021 at 11:14 AM Amit Kapila  wrote:
>
> On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
>  wrote:
> >
> > On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> >
> > Thanks for pointing to the changes in the commit message. I corrected
> > them. Attaching v4 patch set, consider it for further review.
> >
>
> About 0001, have we tried to reproduce the actual bug here which means
> when the error_callback is called we should face some problem? I feel
> with the correct testcase we should hit the Assert
> (Assert(IsTransactionState());) in SearchCatCacheInternal because
> there we expect the transaction to be in a valid state. I understand
> that the transaction is in a broken state at that time but having a
> testcase to hit the actual bug makes it easy to test the fix.

I have not tried hitting the Assert(IsTransactionState() in
SearchCatCacheInternal. To do that, I need to figure out hitting
"incorrect binary data format in logical replication column" error in
either slot_modify_data or slot_store_data so that we will enter the
error callback slot_store_error_callback and then IsTransactionState()
should return false i.e. txn shouldn't be in TRANS_INPROGRESS. I'm not
sure how to do these things. I also looked at the commits 42c80c696e9
and ddfc9cb05 that added Assert(IsTransactionState()); in
SearchCatCacheInternal and RelationIdGetRelation for any relevant
tests or discussions, but I couldn't find anything.

Any further ideas in reproducing the issue?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-01-29 Thread Andres Freund
Hi,

On 2021-01-28 11:14:03 +0530, Amit Kapila wrote:
> On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
>  wrote:
> >
> > On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> >
> > Thanks for pointing to the changes in the commit message. I corrected
> > them. Attaching v4 patch set, consider it for further review.
> >
> 
> About 0001, have we tried to reproduce the actual bug here which means
> when the error_callback is called we should face some problem? I feel
> with the correct testcase we should hit the Assert
> (Assert(IsTransactionState());) in SearchCatCacheInternal because
> there we expect the transaction to be in a valid state. I understand
> that the transaction is in a broken state at that time but having a
> testcase to hit the actual bug makes it easy to test the fix.

I think it's easy to hit - add a trivial DEBUG message to XLogWrite(),
and run anything involving logical rep using a low enough log
level. There might even be enough messages with a low enough level
already somewhere in the relevant paths.

Greetings,

Andres Freund




Re: logical replication worker accesses catalogs in error context callback

2021-01-27 Thread Amit Kapila
On Wed, Jan 27, 2021 at 9:38 AM Bharath Rupireddy
 wrote:
>
> On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
>
> Thanks for pointing to the changes in the commit message. I corrected
> them. Attaching v4 patch set, consider it for further review.
>

About 0001, have we tried to reproduce the actual bug here which means
when the error_callback is called we should face some problem? I feel
with the correct testcase we should hit the Assert
(Assert(IsTransactionState());) in SearchCatCacheInternal because
there we expect the transaction to be in a valid state. I understand
that the transaction is in a broken state at that time but having a
testcase to hit the actual bug makes it easy to test the fix.

-- 
With Regards,
Amit Kapila.




Re: logical replication worker accesses catalogs in error context callback

2021-01-26 Thread Bharath Rupireddy
On Wed, Jan 27, 2021 at 7:48 AM Zhihong Yu  wrote:
> For 0002, a few comments on the description:
>
> bq. Avoid accessing system catalogues inside conversion_error_callback
>
> catalogues -> catalog
>
> bq. error context callback, because the the entire transaction might
>
> There is redundant 'the'
>
> bq. Since get_attname() and get_rel_name() could search the sys cache by
> store the required information
>
> store -> storing

Thanks for pointing to the changes in the commit message. I corrected
them. Attaching v4 patch set, consider it for further review.


With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
From 88767e3176cb6758ac3dd0a0da91f3747f84a4ed Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Tue, 26 Jan 2021 13:49:18 +0530
Subject: [PATCH v4] Avoid Catalogue Accesses In slot_store_error_callback

Avoid accessing system catalogues inside slot_store_error_callback
error context callback, because the the entire transaction might
have been broken at this point.

Since logicalrep_typmap_gettypname() could search the sys cache by
calling to format_type_be(), store both local and remote type names
to SlotErrCallbackArg so that we can just set the names in the error
callback without sys cache lookup.

Author: Masahiko Sawada
---
 src/backend/replication/logical/worker.c | 48 +---
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index eb7db89cef..f7c4fe9c4d 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -132,8 +132,9 @@ static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping);
 typedef struct SlotErrCallbackArg
 {
 	LogicalRepRelMapEntry *rel;
-	int			local_attnum;
 	int			remote_attnum;
+	char		*local_typename;
+	char		*remote_typename;
 } SlotErrCallbackArg;
 
 /*
@@ -430,30 +431,19 @@ static void
 slot_store_error_callback(void *arg)
 {
 	SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
-	LogicalRepRelMapEntry *rel;
-	char	   *remotetypname;
-	Oid			remotetypoid,
-localtypoid;
+	LogicalRepRelMapEntry *rel = errarg->rel;
 
 	/* Nothing to do if remote attribute number is not set */
 	if (errarg->remote_attnum < 0)
 		return;
 
-	rel = errarg->rel;
-	remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
-
-	/* Fetch remote type name from the LogicalRepTypMap cache */
-	remotetypname = logicalrep_typmap_gettypname(remotetypoid);
-
-	/* Fetch local type OID from the local sys cache */
-	localtypoid = get_atttype(rel->localreloid, errarg->local_attnum + 1);
+	Assert(errarg->local_typename && errarg->remote_typename);
 
 	errcontext("processing remote data for replication target relation \"%s.%s\" column \"%s\", "
 			   "remote type %s, local type %s",
 			   rel->remoterel.nspname, rel->remoterel.relname,
 			   rel->remoterel.attnames[errarg->remote_attnum],
-			   remotetypname,
-			   format_type_be(localtypoid));
+			   errarg->remote_typename, errarg->local_typename);
 }
 
 /*
@@ -474,8 +464,9 @@ slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel,
 
 	/* Push callback + info on the error context stack */
 	errarg.rel = rel;
-	errarg.local_attnum = -1;
 	errarg.remote_attnum = -1;
+	errarg.local_typename = NULL;
+	errarg.remote_typename = NULL;
 	errcallback.callback = slot_store_error_callback;
 	errcallback.arg = (void *) 
 	errcallback.previous = error_context_stack;
@@ -491,11 +482,17 @@ slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel,
 		if (!att->attisdropped && remoteattnum >= 0)
 		{
 			StringInfo	colvalue = >colvalues[remoteattnum];
+			Oid remotetypid = rel->remoterel.atttyps[remoteattnum];
 
 			Assert(remoteattnum < tupleData->ncols);
 
-			errarg.local_attnum = i;
+			/*
+			 * Set the attribute number and both local and remote type names
+			 * from the local sys cache and LogicalRepTypMap respectively.
+			 */
 			errarg.remote_attnum = remoteattnum;
+			errarg.local_typename = format_type_be(att->atttypid);
+			errarg.remote_typename = logicalrep_typmap_gettypname(remotetypid);
 
 			if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT)
 			{
@@ -543,8 +540,9 @@ slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel,
 slot->tts_isnull[i] = true;
 			}
 
-			errarg.local_attnum = -1;
 			errarg.remote_attnum = -1;
+			errarg.local_typename = NULL;
+			errarg.remote_typename = NULL;
 		}
 		else
 		{
@@ -600,8 +598,9 @@ slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot,
 
 	/* For error reporting, push callback + info on the error context stack */
 	errarg.rel = rel;
-	errarg.local_attnum = -1;
 	errarg.remote_attnum = -1;
+	errarg.local_typename = NULL;
+	errarg.remote_typename = NULL;
 	errcallback.callback = slot_store_error_callback;
 	errcallback.arg = (void *) 
 	errcallback.previous = error_context_stack;
@@ -622,9 +621,15 @@ slot_modify_data(TupleTableSlot *slot, TupleTableSlot 

Re: logical replication worker accesses catalogs in error context callback

2021-01-26 Thread Zhihong Yu
For 0002, a few comments on the description:

bq. Avoid accessing system catalogues inside conversion_error_callback

catalogues -> catalog

bq. error context callback, because the the entire transaction might

There is redundant 'the'

bq. Since get_attname() and get_rel_name() could search the sys cache by
store the required information

store -> storing

Cheers

On Tue, Jan 26, 2021 at 5:17 PM Bharath Rupireddy <
bharath.rupireddyforpostg...@gmail.com> wrote:

> On Tue, Jan 26, 2021 at 11:29 AM Masahiko Sawada 
> wrote:
> > > IMHO, adding an assertion in SearchCatCacheInternal(which is a most
> > > generic code part within the server) with a few error context global
> > > variables may not be always safe. Because we might miss using the
> > > error context variables properly. Instead, we could add a comment in
> > > ErrorContextCallback structure saying something like, "it's not
> > > recommended to access any system catalogues within an error context
> > > callback when the callback is expected to be called while processing
> > > an error, because the transaction might have been broken in that
> > > case." And let the future callback developers take care of it.
> > >
> > > Thoughts?
> >
> > That sounds good to me. But I still also see the value to add an
> > assertion in SearchCatCacheInternal(). If we had it, we could find
> > these two bugs earlier.
> >
> > Anyway, this seems to be unrelated to this bug fixing so we can start
> > a new thread for that.
>
> +1 to start a new thread for that.
>
> > > As I said earlier [1], currently only two(there could be more) error
> > > context callbacks access the sys catalogues, one is in
> > > slot_store_error_callback which will be fixed with your patch. Another
> > > is in conversion_error_callback, we can also fix this by storing the
> > > relname, attname and other required info in ConversionLocation,
> > > something like the attached. Please have a look.
> > >
> > > Thoughts?
> >
> > Thank you for the patch!
> >
> > Here are some comments:
>
> Thanks for the review comments.
>
> > +static void set_error_callback_info(ConversionLocation *errpos,
> > +  Relation rel, int cur_attno,
> > +  ForeignScanState *fsstate);
> >
> > I'm concerned a bit that the function name is generic. How about
> > set_conversion_error_callback_info() or something to make the name
> > clear?
>
> Done.
>
> > ---
> > +static void
> > +conversion_error_callback(void *arg)
> > +{
> > +   ConversionLocation *errpos = (ConversionLocation *) arg;
> > +
> > +   if (errpos->show_generic_message)
> > +   errcontext("processing expression at position %d in
> > select list",
> > +   errpos->cur_attno);
> > +
> >
> > I think we can set this generic message to the error context when
> > errpos->relname is NULL instead of using show_generic_message.
>
> Right. Done.
>
> > ---
> > +   /*
> > +* Set error context callback info, so that we could avoid
> accessing
> > +* the system catalogues while processing the error in
> > +* conversion_error_callback. System catalogue accesses are not
> safe in
> > +* error context callbacks because the transaction might have
> been
> > +* broken by then.
> > +*/
> > +   set_error_callback_info(, rel, i, fsstate);
> >
> > Looking at other code, we use "catalog" rather than "catalogue" in
> > most places. Is it better to use "catalog" for consistency? FYI, the
> > "catalogue" is used at only three places in the server code:
>
> Changed it to "catalog".
>
> > FYI I've added those bug fixes to the next Commitfest -
> https://commitfest.postgresql.org/32/2955/
>
> Thanks. I'm attaching 2 patches to make it easy for reviewing and also
> they will get a chance to be tested by cf bot.
>
> 0001 - for avoiding system catalog access in slot_store_error_callback.
> 0002 - for avoiding system catalog access in conversion_error_callback
>
> Please review it further.
>
> With Regards,
> Bharath Rupireddy.
> EnterpriseDB: http://www.enterprisedb.com
>


Re: logical replication worker accesses catalogs in error context callback

2021-01-26 Thread Bharath Rupireddy
On Tue, Jan 26, 2021 at 11:29 AM Masahiko Sawada  wrote:
> > IMHO, adding an assertion in SearchCatCacheInternal(which is a most
> > generic code part within the server) with a few error context global
> > variables may not be always safe. Because we might miss using the
> > error context variables properly. Instead, we could add a comment in
> > ErrorContextCallback structure saying something like, "it's not
> > recommended to access any system catalogues within an error context
> > callback when the callback is expected to be called while processing
> > an error, because the transaction might have been broken in that
> > case." And let the future callback developers take care of it.
> >
> > Thoughts?
>
> That sounds good to me. But I still also see the value to add an
> assertion in SearchCatCacheInternal(). If we had it, we could find
> these two bugs earlier.
>
> Anyway, this seems to be unrelated to this bug fixing so we can start
> a new thread for that.

+1 to start a new thread for that.

> > As I said earlier [1], currently only two(there could be more) error
> > context callbacks access the sys catalogues, one is in
> > slot_store_error_callback which will be fixed with your patch. Another
> > is in conversion_error_callback, we can also fix this by storing the
> > relname, attname and other required info in ConversionLocation,
> > something like the attached. Please have a look.
> >
> > Thoughts?
>
> Thank you for the patch!
>
> Here are some comments:

Thanks for the review comments.

> +static void set_error_callback_info(ConversionLocation *errpos,
> +  Relation rel, int cur_attno,
> +  ForeignScanState *fsstate);
>
> I'm concerned a bit that the function name is generic. How about
> set_conversion_error_callback_info() or something to make the name
> clear?

Done.

> ---
> +static void
> +conversion_error_callback(void *arg)
> +{
> +   ConversionLocation *errpos = (ConversionLocation *) arg;
> +
> +   if (errpos->show_generic_message)
> +   errcontext("processing expression at position %d in
> select list",
> +   errpos->cur_attno);
> +
>
> I think we can set this generic message to the error context when
> errpos->relname is NULL instead of using show_generic_message.

Right. Done.

> ---
> +   /*
> +* Set error context callback info, so that we could avoid accessing
> +* the system catalogues while processing the error in
> +* conversion_error_callback. System catalogue accesses are not safe 
> in
> +* error context callbacks because the transaction might have been
> +* broken by then.
> +*/
> +   set_error_callback_info(, rel, i, fsstate);
>
> Looking at other code, we use "catalog" rather than "catalogue" in
> most places. Is it better to use "catalog" for consistency? FYI, the
> "catalogue" is used at only three places in the server code:

Changed it to "catalog".

> FYI I've added those bug fixes to the next Commitfest - 
> https://commitfest.postgresql.org/32/2955/

Thanks. I'm attaching 2 patches to make it easy for reviewing and also
they will get a chance to be tested by cf bot.

0001 - for avoiding system catalog access in slot_store_error_callback.
0002 - for avoiding system catalog access in conversion_error_callback

Please review it further.

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com


v3-0001-Avoid-Catalogue-Accesses-In-slot_store_error_call.patch
Description: Binary data


v3-0002-Avoid-Catalogue-Accesses-In-conversion_error_call.patch
Description: Binary data


Re: logical replication worker accesses catalogs in error context callback

2021-01-25 Thread Masahiko Sawada
On Tue, Jan 12, 2021 at 6:33 PM Bharath Rupireddy
 wrote:
>
> On Tue, Jan 12, 2021 at 9:40 AM Masahiko Sawada  wrote:
> >
> > On Mon, Jan 11, 2021 at 4:46 PM Bharath Rupireddy
> >  wrote:
> > >
> > > On Mon, Jan 11, 2021 at 10:56 AM Masahiko Sawada  
> > > wrote:
> > > > Agreed. Attached the updated patch.
> > >
> > > Thanks for the updated patch. Looks like the comment crosses the 80
> > > char limit, I have corrected it. And also changed the variable name
> > > from remotetypeid to remotetypid, so that logicalrep_typmap_gettypname
> > > will not cross the 80 char limit. And also added a commit message.
> > > Attaching v3 patch, please have a look. Both make check and make
> > > check-world passes.
> >
> > Thanks! The change looks good to me.
>
> Thanks!
>
> > > > > I quickly searched in places where error callbacks are being used, I
> > > > > think we need a similar kind of fix for conversion_error_callback in
> > > > > postgres_fdw.c, because get_attname and get_rel_name are being used
> > > > > which do catalogue lookups. ISTM that all the other error callbacks
> > > > > are good in the sense that they are not doing sys catalogue lookups.
> > > >
> > > > Indeed. If we need to disallow the catalog lookup during executing
> > > > error callbacks we might want to have an assertion checking that in
> > > > SearchCatCacheInternal(), in addition to Assert(IsTransactionState()).
> > >
> > > I tried to add(as attached in
> > > v1-0001-Avoid-Sys-Cache-Lookups-in-Error-Callbacks.patch) the
> > > Assert(!error_context_stack); in SearchCatCacheInternal, initdb itself
> > > fails [1]. That means, we are doing a bunch of catalogue access from
> > > error context callbacks. Given this, I'm not quite sure whether we can
> > > have such an assertion in SearchCatCacheInternal.
> >
> > I think checking !error_context_stack is not a correct check if we're
> > executing an error context callback since it's a stack to store
> > callbacks. It can be non-NULL by setting an error callback, see
> > setup_parser_errposition_callback() for instance.
>
> Thanks! Yes, you are right, even though we are not processing the
> actual error callback, the error_context_stack can be non-null, hence
> the Assert(!error_context_stack); doesn't make any sense.
>
> > Probably we need to check if (recursion_depth > 0) and elevel.
> > Attached a patch for that as an example.
>
> IIUC, we must be knowing in SearchCatCacheInternal, whether errstart
> is called with elevel >= ERROR and we have recursion_depth > 0. If
> both are true, then the assertion in SearchCatCacheInternal should
> fail. I see that in your patch, elevel is being fetched from
> errordata, that's fine. What I'm not quite clear is the following
> assumption:
>
> +/* If we doesn't set any error data yet, assume it's an error */
> +if (errordata_stack_depth == -1)
> +return true;
>
> Is it always true that we are in error processing when
> errordata_stack_depth is -1, what happens if errordata_stack_depth <
> -1? Maybe I'm missing something.

You're right. I missed something.

>
> IMHO, adding an assertion in SearchCatCacheInternal(which is a most
> generic code part within the server) with a few error context global
> variables may not be always safe. Because we might miss using the
> error context variables properly. Instead, we could add a comment in
> ErrorContextCallback structure saying something like, "it's not
> recommended to access any system catalogues within an error context
> callback when the callback is expected to be called while processing
> an error, because the transaction might have been broken in that
> case." And let the future callback developers take care of it.
>
> Thoughts?

That sounds good to me. But I still also see the value to add an
assertion in SearchCatCacheInternal(). If we had it, we could find
these two bugs earlier.

Anyway, this seems to be unrelated to this bug fixing so we can start
a new thread for that.

>
> As I said earlier [1], currently only two(there could be more) error
> context callbacks access the sys catalogues, one is in
> slot_store_error_callback which will be fixed with your patch. Another
> is in conversion_error_callback, we can also fix this by storing the
> relname, attname and other required info in ConversionLocation,
> something like the attached. Please have a look.
>
> Thoughts?

Thank you for the patch!

Here are some comments:

+static void set_error_callback_info(ConversionLocation *errpos,
+  Relation rel, int cur_attno,
+  ForeignScanState *fsstate);

I'm concerned a bit that the function name is generic. How about
set_conversion_error_callback_info() or something to make the name
clear?

---
+static void
+conversion_error_callback(void *arg)
+{
+   ConversionLocation *errpos = (ConversionLocation *) arg;
+
+   if (errpos->show_generic_message)
+   errcontext("processing expression at position %d 

Re: logical replication worker accesses catalogs in error context callback

2021-01-12 Thread Bharath Rupireddy
On Tue, Jan 12, 2021 at 9:40 AM Masahiko Sawada  wrote:
>
> On Mon, Jan 11, 2021 at 4:46 PM Bharath Rupireddy
>  wrote:
> >
> > On Mon, Jan 11, 2021 at 10:56 AM Masahiko Sawada  
> > wrote:
> > > Agreed. Attached the updated patch.
> >
> > Thanks for the updated patch. Looks like the comment crosses the 80
> > char limit, I have corrected it. And also changed the variable name
> > from remotetypeid to remotetypid, so that logicalrep_typmap_gettypname
> > will not cross the 80 char limit. And also added a commit message.
> > Attaching v3 patch, please have a look. Both make check and make
> > check-world passes.
>
> Thanks! The change looks good to me.

Thanks!

> > > > I quickly searched in places where error callbacks are being used, I
> > > > think we need a similar kind of fix for conversion_error_callback in
> > > > postgres_fdw.c, because get_attname and get_rel_name are being used
> > > > which do catalogue lookups. ISTM that all the other error callbacks
> > > > are good in the sense that they are not doing sys catalogue lookups.
> > >
> > > Indeed. If we need to disallow the catalog lookup during executing
> > > error callbacks we might want to have an assertion checking that in
> > > SearchCatCacheInternal(), in addition to Assert(IsTransactionState()).
> >
> > I tried to add(as attached in
> > v1-0001-Avoid-Sys-Cache-Lookups-in-Error-Callbacks.patch) the
> > Assert(!error_context_stack); in SearchCatCacheInternal, initdb itself
> > fails [1]. That means, we are doing a bunch of catalogue access from
> > error context callbacks. Given this, I'm not quite sure whether we can
> > have such an assertion in SearchCatCacheInternal.
>
> I think checking !error_context_stack is not a correct check if we're
> executing an error context callback since it's a stack to store
> callbacks. It can be non-NULL by setting an error callback, see
> setup_parser_errposition_callback() for instance.

Thanks! Yes, you are right, even though we are not processing the
actual error callback, the error_context_stack can be non-null, hence
the Assert(!error_context_stack); doesn't make any sense.

> Probably we need to check if (recursion_depth > 0) and elevel.
> Attached a patch for that as an example.

IIUC, we must be knowing in SearchCatCacheInternal, whether errstart
is called with elevel >= ERROR and we have recursion_depth > 0. If
both are true, then the assertion in SearchCatCacheInternal should
fail. I see that in your patch, elevel is being fetched from
errordata, that's fine. What I'm not quite clear is the following
assumption:

+/* If we doesn't set any error data yet, assume it's an error */
+if (errordata_stack_depth == -1)
+return true;

Is it always true that we are in error processing when
errordata_stack_depth is -1, what happens if errordata_stack_depth <
-1? Maybe I'm missing something.

IMHO, adding an assertion in SearchCatCacheInternal(which is a most
generic code part within the server) with a few error context global
variables may not be always safe. Because we might miss using the
error context variables properly. Instead, we could add a comment in
ErrorContextCallback structure saying something like, "it's not
recommended to access any system catalogues within an error context
callback when the callback is expected to be called while processing
an error, because the transaction might have been broken in that
case." And let the future callback developers take care of it.

Thoughts?

As I said earlier [1], currently only two(there could be more) error
context callbacks access the sys catalogues, one is in
slot_store_error_callback which will be fixed with your patch. Another
is in conversion_error_callback, we can also fix this by storing the
relname, attname and other required info in ConversionLocation,
something like the attached. Please have a look.

Thoughts?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
From 8212d2d635096096211d4330ef08d6a21d4ad568 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Tue, 12 Jan 2021 14:42:45 +0530
Subject: [PATCH v1] Avoid Catalogue Accesses In conversion_error_callback

Avoid accessing system catalogues inside conversion_error_callback
error context callback, because the the entire transaction might
have been broken at this point.

Since get_attname() and get_rel_name() could search the sys cache by
store the required information into ConversionLocation so that we can
just use that info in the error callback without sys cache lookup.

Author: Bharath Rupireddy
---
 contrib/postgres_fdw/postgres_fdw.c | 102 ++--
 1 file changed, 68 insertions(+), 34 deletions(-)

diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 2f2d4d171c..56735821fe 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -283,16 +283,11 @@ typedef struct
  */
 typedef struct ConversionLocation
 {
-	Relation	rel;			/* foreign table's 

Re: logical replication worker accesses catalogs in error context callback

2021-01-11 Thread Masahiko Sawada
On Mon, Jan 11, 2021 at 4:46 PM Bharath Rupireddy
 wrote:
>
> On Mon, Jan 11, 2021 at 10:56 AM Masahiko Sawada  
> wrote:
> > Agreed. Attached the updated patch.
>
> Thanks for the updated patch. Looks like the comment crosses the 80
> char limit, I have corrected it. And also changed the variable name
> from remotetypeid to remotetypid, so that logicalrep_typmap_gettypname
> will not cross the 80 char limit. And also added a commit message.
> Attaching v3 patch, please have a look. Both make check and make
> check-world passes.

Thanks! The change looks good to me.

>
> > > I quickly searched in places where error callbacks are being used, I
> > > think we need a similar kind of fix for conversion_error_callback in
> > > postgres_fdw.c, because get_attname and get_rel_name are being used
> > > which do catalogue lookups. ISTM that all the other error callbacks
> > > are good in the sense that they are not doing sys catalogue lookups.
> >
> > Indeed. If we need to disallow the catalog lookup during executing
> > error callbacks we might want to have an assertion checking that in
> > SearchCatCacheInternal(), in addition to Assert(IsTransactionState()).
>
> I tried to add(as attached in
> v1-0001-Avoid-Sys-Cache-Lookups-in-Error-Callbacks.patch) the
> Assert(!error_context_stack); in SearchCatCacheInternal, initdb itself
> fails [1]. That means, we are doing a bunch of catalogue access from
> error context callbacks. Given this, I'm not quite sure whether we can
> have such an assertion in SearchCatCacheInternal.

I think checking !error_context_stack is not a correct check if we're
executing an error context callback since it's a stack to store
callbacks. It can be non-NULL by setting an error callback, see
setup_parser_errposition_callback() for instance. Probably we need to
check if (recursion_depth > 0) and elevel. Attached a patch for that
as an example.

>
> Although unrelated to what we are discussing here -  when I looked at
> SearchCatCacheInternal, I found that the function SearchCatCache has
> SearchCatCacheInternal in the function comment, I think we should
> correct it. Thoughts? If okay, I will post a separate patch for this
> minor comment fix.

Perhaps you mean this?

/*
 *  SearchCatCacheInternal
 *
 *  This call searches a system cache for a tuple, opening the relation
 *  if necessary (on the first access to a particular cache).
 *
 *  The result is NULL if not found, or a pointer to a HeapTuple in
 *  the cache.  The caller must not modify the tuple, and must call
 *  ReleaseCatCache() when done with it.
 *
 * The search key values should be expressed as Datums of the key columns'
 * datatype(s).  (Pass zeroes for any unused parameters.)  As a special
 * exception, the passed-in key for a NAME column can be just a C string;
 * the caller need not go to the trouble of converting it to a fully
 * null-padded NAME.
 */
HeapTuple
SearchCatCache(CatCache *cache,

Looking at commit 141fd1b66 it intentionally changed to
SearchCatCacheInternal from SearchCatCache but it seems to me that
it's a typo.

Regards,

--
Masahiko Sawada
EnterpriseDB:  https://www.enterprisedb.com/


avoid_sys_cache_lookup_in_error_callback_v2.patch
Description: Binary data


Re: logical replication worker accesses catalogs in error context callback

2021-01-10 Thread Bharath Rupireddy
On Mon, Jan 11, 2021 at 10:56 AM Masahiko Sawada  wrote:
> Agreed. Attached the updated patch.

Thanks for the updated patch. Looks like the comment crosses the 80
char limit, I have corrected it. And also changed the variable name
from remotetypeid to remotetypid, so that logicalrep_typmap_gettypname
will not cross the 80 char limit. And also added a commit message.
Attaching v3 patch, please have a look. Both make check and make
check-world passes.

> > I quickly searched in places where error callbacks are being used, I
> > think we need a similar kind of fix for conversion_error_callback in
> > postgres_fdw.c, because get_attname and get_rel_name are being used
> > which do catalogue lookups. ISTM that all the other error callbacks
> > are good in the sense that they are not doing sys catalogue lookups.
>
> Indeed. If we need to disallow the catalog lookup during executing
> error callbacks we might want to have an assertion checking that in
> SearchCatCacheInternal(), in addition to Assert(IsTransactionState()).

I tried to add(as attached in
v1-0001-Avoid-Sys-Cache-Lookups-in-Error-Callbacks.patch) the
Assert(!error_context_stack); in SearchCatCacheInternal, initdb itself
fails [1]. That means, we are doing a bunch of catalogue access from
error context callbacks. Given this, I'm not quite sure whether we can
have such an assertion in SearchCatCacheInternal.

Although unrelated to what we are discussing here -  when I looked at
SearchCatCacheInternal, I found that the function SearchCatCache has
SearchCatCacheInternal in the function comment, I think we should
correct it. Thoughts? If okay, I will post a separate patch for this
minor comment fix.

[1]
running bootstrap script ... TRAP:
FailedAssertion("error_context_stack", File: "catcache.c", Line: 1220,
PID: 310728)
/home/bharath/workspace/postgres/instnew/bin/postgres(ExceptionalCondition+0xd0)[0x56056984c8ba]
/home/bharath/workspace/postgres/instnew/bin/postgres(+0x76eb1a)[0x560569826b1a]
/home/bharath/workspace/postgres/instnew/bin/postgres(SearchCatCache1+0x3a)[0x5605698269af]
/home/bharath/workspace/postgres/instnew/bin/postgres(SearchSysCache1+0xc1)[0x5605698448b2]
/home/bharath/workspace/postgres/instnew/bin/postgres(get_typtype+0x1f)[0x56056982e389]
/home/bharath/workspace/postgres/instnew/bin/postgres(CheckAttributeType+0x29)[0x5605692bafe9]
/home/bharath/workspace/postgres/instnew/bin/postgres(CheckAttributeNamesTypes+0x2c9)[0x5605692bafac]
/home/bharath/workspace/postgres/instnew/bin/postgres(heap_create_with_catalog+0x11f)[0x5605692bc436]
/home/bharath/workspace/postgres/instnew/bin/postgres(boot_yyparse+0x7f0)[0x5605692a0d3f]
/home/bharath/workspace/postgres/instnew/bin/postgres(+0x1ecb36)[0x5605692a4b36]
/home/bharath/workspace/postgres/instnew/bin/postgres(AuxiliaryProcessMain+0x5e0)[0x5605692a4997]
/home/bharath/workspace/postgres/instnew/bin/postgres(main+0x268)[0x5605694c6bce]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7fa2777ce0b3]
/home/bharath/workspace/postgres/instnew/bin/postgres(_start+0x2e)[0x5605691794de]

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
From a806415a49e3fc8c1e71a102aefb0c10c08deb05 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Mon, 11 Jan 2021 12:26:12 +0530
Subject: [PATCH v3] fix slot store error callback

Avoid accessing system catalogues inside slot_store_error_callback
error context callback, because the the entire transaction might
have been broken at this point.

Since logicalrep_typmap_gettypname() could search the sys cache by
calling to format_type_be(), store both local and remote type names
to SlotErrCallbackArg so that we can just set the names in the error
callback without sys cache lookup.

Author: Masahiko Sawada
---
 src/backend/replication/logical/worker.c | 50 +---
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index 1b1d70ed68..95f28c9acf 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -132,8 +132,9 @@ static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping);
 typedef struct SlotErrCallbackArg
 {
 	LogicalRepRelMapEntry *rel;
-	int			local_attnum;
 	int			remote_attnum;
+	char		*local_typename;
+	char		*remote_typename;
 } SlotErrCallbackArg;
 
 /*
@@ -430,30 +431,19 @@ static void
 slot_store_error_callback(void *arg)
 {
 	SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
-	LogicalRepRelMapEntry *rel;
-	char	   *remotetypname;
-	Oid			remotetypoid,
-localtypoid;
+	LogicalRepRelMapEntry *rel = errarg->rel;
 
 	/* Nothing to do if remote attribute number is not set */
 	if (errarg->remote_attnum < 0)
 		return;
 
-	rel = errarg->rel;
-	remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
-
-	/* Fetch remote type name from the LogicalRepTypMap cache */
-	remotetypname = logicalrep_typmap_gettypname(remotetypoid);
-
-	/* Fetch local 

Re: logical replication worker accesses catalogs in error context callback

2021-01-10 Thread Masahiko Sawada
On Sat, Jan 9, 2021 at 2:57 PM Bharath Rupireddy
 wrote:
>
> On Thu, Jan 7, 2021 at 5:47 AM Masahiko Sawada  wrote:
> >
> > On Wed, Jan 6, 2021 at 11:42 AM Masahiko Sawada  
> > wrote:
> > >
> > > On Wed, Jan 6, 2021 at 11:02 AM Andres Freund  wrote:
> > > >
> > > > Hi,
> > > >
> > > > Due to a debug ereport I just noticed that worker.c's
> > > > slot_store_error_callback is doing something quite dangerous:
> > > >
> > > > static void
> > > > slot_store_error_callback(void *arg)
> > > > {
> > > > SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
> > > > LogicalRepRelMapEntry *rel;
> > > > char   *remotetypname;
> > > > Oid remotetypoid,
> > > > localtypoid;
> > > >
> > > > /* Nothing to do if remote attribute number is not set */
> > > > if (errarg->remote_attnum < 0)
> > > > return;
> > > >
> > > > rel = errarg->rel;
> > > > remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
> > > >
> > > > /* Fetch remote type name from the LogicalRepTypMap cache */
> > > > remotetypname = logicalrep_typmap_gettypname(remotetypoid);
> > > >
> > > > /* Fetch local type OID from the local sys cache */
> > > > localtypoid = get_atttype(rel->localreloid, 
> > > > errarg->local_attnum + 1);
> > > >
> > > > errcontext("processing remote data for replication target 
> > > > relation \"%s.%s\" column \"%s\", "
> > > >"remote type %s, local type %s",
> > > >rel->remoterel.nspname, 
> > > > rel->remoterel.relname,
> > > >
> > > > rel->remoterel.attnames[errarg->remote_attnum],
> > > >remotetypname,
> > > >format_type_be(localtypoid));
> > > > }
> > > >
> > > >
> > > > that's not code that can run in an error context callback. It's
> > > > theoretically possible (but unlikely) that
> > > > logicalrep_typmap_gettypname() is safe to run in an error context
> > > > callback. But get_atttype() definitely isn't.
> > > >
> > > > get_attype() may do catalog accesses. That definitely can't happen
> > > > inside an error context callback - the entire transaction might be
> > > > borked at this point!
> > >
> > > You're right. Perhaps calling to format_type_be() is also dangerous
> > > since it does catalog access. We should have added the local type
> > > names to SlotErrCallbackArg so we avoid catalog access in the error
> > > context.
> > >
> > > I'll try to fix this.
> >
> > Attached the patch that fixes this issue.
> >
> > Since logicalrep_typmap_gettypname() could search the sys cache by
> > calling to format_type_be(), I stored both local and remote type names
> > to SlotErrCallbackArg so that we can just set the names in the error
> > callback without sys cache lookup.
> >
> > Please review it.
>
> Patch looks good, except one minor comment - can we store
> rel->remoterel.atttyps[remoteattnum] into a local variable and use it
> in logicalrep_typmap_gettypname, just to save on indentation?

Thank you for reviewing the patch!

Agreed. Attached the updated patch.

>
> I quickly searched in places where error callbacks are being used, I
> think we need a similar kind of fix for conversion_error_callback in
> postgres_fdw.c, because get_attname and get_rel_name are being used
> which do catalogue lookups. ISTM that all the other error callbacks
> are good in the sense that they are not doing sys catalogue lookups.

Indeed. If we need to disallow the catalog lookup during executing
error callbacks we might want to have an assertion checking that in
SearchCatCacheInternal(), in addition to Assert(IsTransactionState()).

Regards,

-- 
Masahiko Sawada
EnterpriseDB:  https://www.enterprisedb.com/


fix_slot_store_error_callback_v2.patch
Description: Binary data


Re: logical replication worker accesses catalogs in error context callback

2021-01-08 Thread Bharath Rupireddy
On Thu, Jan 7, 2021 at 5:47 AM Masahiko Sawada  wrote:
>
> On Wed, Jan 6, 2021 at 11:42 AM Masahiko Sawada  wrote:
> >
> > On Wed, Jan 6, 2021 at 11:02 AM Andres Freund  wrote:
> > >
> > > Hi,
> > >
> > > Due to a debug ereport I just noticed that worker.c's
> > > slot_store_error_callback is doing something quite dangerous:
> > >
> > > static void
> > > slot_store_error_callback(void *arg)
> > > {
> > > SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
> > > LogicalRepRelMapEntry *rel;
> > > char   *remotetypname;
> > > Oid remotetypoid,
> > > localtypoid;
> > >
> > > /* Nothing to do if remote attribute number is not set */
> > > if (errarg->remote_attnum < 0)
> > > return;
> > >
> > > rel = errarg->rel;
> > > remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
> > >
> > > /* Fetch remote type name from the LogicalRepTypMap cache */
> > > remotetypname = logicalrep_typmap_gettypname(remotetypoid);
> > >
> > > /* Fetch local type OID from the local sys cache */
> > > localtypoid = get_atttype(rel->localreloid, errarg->local_attnum 
> > > + 1);
> > >
> > > errcontext("processing remote data for replication target 
> > > relation \"%s.%s\" column \"%s\", "
> > >"remote type %s, local type %s",
> > >rel->remoterel.nspname, rel->remoterel.relname,
> > >rel->remoterel.attnames[errarg->remote_attnum],
> > >remotetypname,
> > >format_type_be(localtypoid));
> > > }
> > >
> > >
> > > that's not code that can run in an error context callback. It's
> > > theoretically possible (but unlikely) that
> > > logicalrep_typmap_gettypname() is safe to run in an error context
> > > callback. But get_atttype() definitely isn't.
> > >
> > > get_attype() may do catalog accesses. That definitely can't happen
> > > inside an error context callback - the entire transaction might be
> > > borked at this point!
> >
> > You're right. Perhaps calling to format_type_be() is also dangerous
> > since it does catalog access. We should have added the local type
> > names to SlotErrCallbackArg so we avoid catalog access in the error
> > context.
> >
> > I'll try to fix this.
>
> Attached the patch that fixes this issue.
>
> Since logicalrep_typmap_gettypname() could search the sys cache by
> calling to format_type_be(), I stored both local and remote type names
> to SlotErrCallbackArg so that we can just set the names in the error
> callback without sys cache lookup.
>
> Please review it.

Patch looks good, except one minor comment - can we store
rel->remoterel.atttyps[remoteattnum] into a local variable and use it
in logicalrep_typmap_gettypname, just to save on indentation?

I quickly searched in places where error callbacks are being used, I
think we need a similar kind of fix for conversion_error_callback in
postgres_fdw.c, because get_attname and get_rel_name are being used
which do catalogue lookups. ISTM that all the other error callbacks
are good in the sense that they are not doing sys catalogue lookups.

Thoughts?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: logical replication worker accesses catalogs in error context callback

2021-01-06 Thread Masahiko Sawada
On Wed, Jan 6, 2021 at 11:42 AM Masahiko Sawada  wrote:
>
> On Wed, Jan 6, 2021 at 11:02 AM Andres Freund  wrote:
> >
> > Hi,
> >
> > Due to a debug ereport I just noticed that worker.c's
> > slot_store_error_callback is doing something quite dangerous:
> >
> > static void
> > slot_store_error_callback(void *arg)
> > {
> > SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
> > LogicalRepRelMapEntry *rel;
> > char   *remotetypname;
> > Oid remotetypoid,
> > localtypoid;
> >
> > /* Nothing to do if remote attribute number is not set */
> > if (errarg->remote_attnum < 0)
> > return;
> >
> > rel = errarg->rel;
> > remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
> >
> > /* Fetch remote type name from the LogicalRepTypMap cache */
> > remotetypname = logicalrep_typmap_gettypname(remotetypoid);
> >
> > /* Fetch local type OID from the local sys cache */
> > localtypoid = get_atttype(rel->localreloid, errarg->local_attnum + 
> > 1);
> >
> > errcontext("processing remote data for replication target relation 
> > \"%s.%s\" column \"%s\", "
> >"remote type %s, local type %s",
> >rel->remoterel.nspname, rel->remoterel.relname,
> >rel->remoterel.attnames[errarg->remote_attnum],
> >remotetypname,
> >format_type_be(localtypoid));
> > }
> >
> >
> > that's not code that can run in an error context callback. It's
> > theoretically possible (but unlikely) that
> > logicalrep_typmap_gettypname() is safe to run in an error context
> > callback. But get_atttype() definitely isn't.
> >
> > get_attype() may do catalog accesses. That definitely can't happen
> > inside an error context callback - the entire transaction might be
> > borked at this point!
>
> You're right. Perhaps calling to format_type_be() is also dangerous
> since it does catalog access. We should have added the local type
> names to SlotErrCallbackArg so we avoid catalog access in the error
> context.
>
> I'll try to fix this.

Attached the patch that fixes this issue.

Since logicalrep_typmap_gettypname() could search the sys cache by
calling to format_type_be(), I stored both local and remote type names
to SlotErrCallbackArg so that we can just set the names in the error
callback without sys cache lookup.

Please review it.

Regards,

-- 
Masahiko Sawada
EnterpriseDB:  https://www.enterprisedb.com/


fix_slot_store_error_callback.patch
Description: Binary data


Re: logical replication worker accesses catalogs in error context callback

2021-01-05 Thread Masahiko Sawada
On Wed, Jan 6, 2021 at 11:02 AM Andres Freund  wrote:
>
> Hi,
>
> Due to a debug ereport I just noticed that worker.c's
> slot_store_error_callback is doing something quite dangerous:
>
> static void
> slot_store_error_callback(void *arg)
> {
> SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
> LogicalRepRelMapEntry *rel;
> char   *remotetypname;
> Oid remotetypoid,
> localtypoid;
>
> /* Nothing to do if remote attribute number is not set */
> if (errarg->remote_attnum < 0)
> return;
>
> rel = errarg->rel;
> remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
>
> /* Fetch remote type name from the LogicalRepTypMap cache */
> remotetypname = logicalrep_typmap_gettypname(remotetypoid);
>
> /* Fetch local type OID from the local sys cache */
> localtypoid = get_atttype(rel->localreloid, errarg->local_attnum + 1);
>
> errcontext("processing remote data for replication target relation 
> \"%s.%s\" column \"%s\", "
>"remote type %s, local type %s",
>rel->remoterel.nspname, rel->remoterel.relname,
>rel->remoterel.attnames[errarg->remote_attnum],
>remotetypname,
>format_type_be(localtypoid));
> }
>
>
> that's not code that can run in an error context callback. It's
> theoretically possible (but unlikely) that
> logicalrep_typmap_gettypname() is safe to run in an error context
> callback. But get_atttype() definitely isn't.
>
> get_attype() may do catalog accesses. That definitely can't happen
> inside an error context callback - the entire transaction might be
> borked at this point!

You're right. Perhaps calling to format_type_be() is also dangerous
since it does catalog access. We should have added the local type
names to SlotErrCallbackArg so we avoid catalog access in the error
context.

I'll try to fix this.

Regards,

-- 
Masahiko Sawada
EnterpriseDB:  https://www.enterprisedb.com/




logical replication worker accesses catalogs in error context callback

2021-01-05 Thread Andres Freund
Hi,

Due to a debug ereport I just noticed that worker.c's
slot_store_error_callback is doing something quite dangerous:

static void
slot_store_error_callback(void *arg)
{
SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
LogicalRepRelMapEntry *rel;
char   *remotetypname;
Oid remotetypoid,
localtypoid;

/* Nothing to do if remote attribute number is not set */
if (errarg->remote_attnum < 0)
return;

rel = errarg->rel;
remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];

/* Fetch remote type name from the LogicalRepTypMap cache */
remotetypname = logicalrep_typmap_gettypname(remotetypoid);

/* Fetch local type OID from the local sys cache */
localtypoid = get_atttype(rel->localreloid, errarg->local_attnum + 1);

errcontext("processing remote data for replication target relation 
\"%s.%s\" column \"%s\", "
   "remote type %s, local type %s",
   rel->remoterel.nspname, rel->remoterel.relname,
   rel->remoterel.attnames[errarg->remote_attnum],
   remotetypname,
   format_type_be(localtypoid));
}


that's not code that can run in an error context callback. It's
theoretically possible (but unlikely) that
logicalrep_typmap_gettypname() is safe to run in an error context
callback. But get_atttype() definitely isn't.

get_attype() may do catalog accesses. That definitely can't happen
inside an error context callback - the entire transaction might be
borked at this point!


I think this was basically broken from the start in

commit 665d1fad99e7b11678b0d5fa24d2898424243cd6
Author: Peter Eisentraut 
Date:   2017-01-19 12:00:00 -0500

Logical replication

but then got a lot worse in

commit 24c0a6c649768f428929e76dd7f5012ec9b93ce1
Author: Alvaro Herrera 
Date:   2018-03-14 21:34:26 -0300

logical replication: fix OID type mapping mechanism

Greetings,

Andres Freund