On 2020/09/25 13:56, Bharath Rupireddy wrote:
On Wed, Sep 23, 2020 at 8:19 PM Fujii Masao <masao.fu...@oss.nttdata.com> wrote:

Please let me know if okay with the above agreed points, I will work on the new 
patch.

Yes, please work on the patch! Thanks! I may revisit the above points later, 
though ;)


Thanks, attaching v4 patch. Please consider this for further review.

Thanks for updating the patch!

In the orignal code, disconnect_pg_server() is called when invalidation
occurs and connect_pg_server() is called when no valid connection exists.
I think that we can simplify the code by merging the connection-retry
code into them, like the attached very WIP patch (based on yours) does.
Basically I'd like to avoid duplicating disconnect_pg_server(),
connect_pg_server() and begin_remote_xact() if possible. Thought?

Your patch adds several codes into PG_CATCH() section, but it's better
 to keep that section simple enough (as the source comment for
PG_CATCH() explains). So what about moving some codes out of PG_CATCH()
section?

+                       else
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+                                        errmsg("could not connect to server 
\"%s\"",
+                                                       server->servername),
+                                        errdetail_internal("%s", 
pchomp(PQerrorMessage(entry->conn)))));

The above is not necessary? If this error occurs, connect_pg_server()
reports an error, before the above code is reached. Right?

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
diff --git a/contrib/postgres_fdw/connection.c 
b/contrib/postgres_fdw/connection.c
index 08daf26fdf..9f76261d99 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -108,6 +108,7 @@ PGconn *
 GetConnection(UserMapping *user, bool will_prep_stmt)
 {
        bool            found;
+       bool            need_reset_conn = false;
        ConnCacheEntry *entry;
        ConnCacheKey key;
 
@@ -159,23 +160,25 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
        /* Reject further use of connections which failed abort cleanup. */
        pgfdw_reject_incomplete_xact_state_change(entry);
 
+reset:
        /*
         * If the connection needs to be remade due to invalidation, disconnect 
as
-        * soon as we're out of all transactions.
+        * soon as we're out of all transactions. Also if previous attempt to 
start
+        * new remote transaction failed on the cached connection, disconnect
+        * it to reestablish new connection.
         */
-       if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0)
+       if ((entry->conn != NULL && entry->invalidated && entry->xact_depth == 
0) ||
+               need_reset_conn)
        {
-               elog(DEBUG3, "closing connection %p for option changes to take 
effect",
-                        entry->conn);
+               if (need_reset_conn)
+                       elog(DEBUG3, "closing connection %p to reestablish 
connection",
+                                entry->conn);
+               else
+                       elog(DEBUG3, "closing connection %p for option changes 
to take effect",
+                                entry->conn);
                disconnect_pg_server(entry);
        }
 
-       /*
-        * We don't check the health of cached connection here, because it would
-        * require some overhead.  Broken connection will be detected when the
-        * connection is actually used.
-        */
-
        /*
         * If cache entry doesn't have a connection, we have to establish a new
         * connection.  (If connect_pg_server throws an error, the cache entry
@@ -206,9 +209,31 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
        }
 
        /*
-        * Start a new transaction or subtransaction if needed.
+        * We check the health of the cached connection here, while the remote
+        * xact is going to begin.  Broken connection will be detected and the
+        * connection is retried. We do this only once during each begin remote
+        * xact call, if the connection is still broken after the retry attempt,
+        * then the query will fail.
         */
-       begin_remote_xact(entry);
+       PG_TRY();
+       {
+               /* Start a new transaction or subtransaction if needed. */
+               begin_remote_xact(entry);
+               need_reset_conn = false;
+       }
+       PG_CATCH();
+       {
+               if (!entry->conn ||
+                       PQstatus(entry->conn) != CONNECTION_BAD ||
+                       entry->xact_depth > 0 ||
+                       need_reset_conn)
+                       PG_RE_THROW();
+               need_reset_conn = true;
+       }
+       PG_END_TRY();
+
+       if (need_reset_conn)
+               goto reset;
 
        /* Remember if caller will prepare statements */
        entry->have_prep_stmt |= will_prep_stmt;
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out 
b/contrib/postgres_fdw/expected/postgres_fdw.out
index 10e23d02ed..d72b11a3ac 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -8987,3 +8987,58 @@ PREPARE TRANSACTION 'fdw_tpc';
 ERROR:  cannot PREPARE a transaction that has operated on postgres_fdw foreign 
tables
 ROLLBACK;
 WARNING:  there is no transaction in progress
+-- Retry cached connections at the beginning of the remote xact
+-- in case remote backend is killed.
+-- Let's use a different application name for remote connection,
+-- so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+-- Generate a connection to remote. Local backend will cache it.
+SELECT * FROM ft1 LIMIT 1;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+-- Retrieve pid of remote backend with application name fdw_retry_check
+-- and kill it intentionally here. Note that, local backend still has
+-- the remote connection/backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+-- Next query using the same foreign server should succeed if connection
+-- retry works.
+SELECT * FROM ft1 LIMIT 1;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+-- Subtransaction - remote backend is killed in the middle of a remote
+-- xact, and we don't do retry connection, hence the subsequent query
+-- using the same foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1;
+FETCH c;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+SELECT * FROM ft1 LIMIT 1;    -- should fail
+ERROR:  server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+CONTEXT:  remote SQL command: SAVEPOINT s2
+COMMIT;
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql 
b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 78156d10b4..7a1b4c78bd 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2653,3 +2653,34 @@ SELECT count(*) FROM ft1;
 -- error here
 PREPARE TRANSACTION 'fdw_tpc';
 ROLLBACK;
+
+-- Retry cached connections at the beginning of the remote xact
+-- in case remote backend is killed.
+-- Let's use a different application name for remote connection,
+-- so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+
+-- Generate a connection to remote. Local backend will cache it.
+SELECT * FROM ft1 LIMIT 1;
+
+-- Retrieve pid of remote backend with application name fdw_retry_check
+-- and kill it intentionally here. Note that, local backend still has
+-- the remote connection/backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+
+-- Next query using the same foreign server should succeed if connection
+-- retry works.
+SELECT * FROM ft1 LIMIT 1;
+
+-- Subtransaction - remote backend is killed in the middle of a remote
+-- xact, and we don't do retry connection, hence the subsequent query
+-- using the same foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1;
+FETCH c;
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+SELECT * FROM ft1 LIMIT 1;    -- should fail
+COMMIT;

Reply via email to