From e96aab665c33b2a8f198fe802dae7ab410a01471 Mon Sep 17 00:00:00 2001
From: Jacob Champion <jacob.champion@enterprisedb.com>
Date: Fri, 6 Jun 2025 15:22:41 -0700
Subject: [PATCH 3/4] oauth: Track total call count during a client flow

Tracking down the bugs that led to the addition of drain_socket_events()
and drain_timer_events() was difficult, because an inefficient flow is
not visibly different from one that is working properly. To help
maintainers notice when something has gone wrong, track the number of
calls into the flow as part of debug mode, and print the total when the
flow finishes.

A new test makes sure the total count is less than 100. (We expect
something on the order of 10.) This isn't foolproof, but it is able to
catch several regressions in the logic of the prior two commits, and
future work to add TLS support to the oauth_validator test server should
strengthen it as well.
---
 src/interfaces/libpq-oauth/oauth-curl.c       | 22 +++++++++++++
 .../modules/oauth_validator/t/001_server.pl   | 31 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/interfaces/libpq-oauth/oauth-curl.c b/src/interfaces/libpq-oauth/oauth-curl.c
index 78ba3399495..68303106a5d 100644
--- a/src/interfaces/libpq-oauth/oauth-curl.c
+++ b/src/interfaces/libpq-oauth/oauth-curl.c
@@ -278,6 +278,7 @@ struct async_ctx
 	bool		user_prompted;	/* have we already sent the authz prompt? */
 	bool		used_basic_auth;	/* did we send a client secret? */
 	bool		debugging;		/* can we give unsafe developer assistance? */
+	int			dbg_num_calls;	/* (debug mode) how many times were we called? */
 
 #if defined(HAVE_SYS_EVENT_H)
 	int			nevents;		/* how many events are we waiting on? */
@@ -3074,6 +3075,8 @@ PostgresPollingStatusType
 pg_fe_run_oauth_flow(PGconn *conn)
 {
 	PostgresPollingStatusType result;
+	fe_oauth_state *state = conn_sasl_state(conn);
+	struct async_ctx *actx;
 #ifndef WIN32
 	sigset_t	osigset;
 	bool		sigpipe_pending;
@@ -3102,6 +3105,25 @@ pg_fe_run_oauth_flow(PGconn *conn)
 
 	result = pg_fe_run_oauth_flow_impl(conn);
 
+	/*
+	 * To assist with finding bugs in drain_socket_events() and
+	 * drain_timer_events(), when we're in debug mode, track the total number
+	 * of calls to this function and print that at the end of the flow.
+	 *
+	 * Be careful that state->async_ctx could be NULL if early initialization
+	 * fails during the first call.
+	 */
+	actx = state->async_ctx;
+	Assert(actx || result == PGRES_POLLING_FAILED);
+
+	if (actx && actx->debugging)
+	{
+		actx->dbg_num_calls++;
+		if (result == PGRES_POLLING_OK || result == PGRES_POLLING_FAILED)
+			fprintf(stderr, "[libpq] total number of polls: %d\n",
+					actx->dbg_num_calls);
+	}
+
 #ifndef WIN32
 	if (masked)
 	{
diff --git a/src/test/modules/oauth_validator/t/001_server.pl b/src/test/modules/oauth_validator/t/001_server.pl
index 41672ebd5c6..c0dafb8be76 100644
--- a/src/test/modules/oauth_validator/t/001_server.pl
+++ b/src/test/modules/oauth_validator/t/001_server.pl
@@ -418,6 +418,35 @@ $node->connect_fails(
 	  qr/failed to obtain access token: mutual TLS required for client \(invalid_client\)/
 );
 
+# Count the number of calls to the internal flow when multiple retries are
+# triggered. The exact number depends on many things -- the TCP stack, the
+# version of Curl in use, random chance -- but a ridiculously high number
+# suggests something is wrong with our ability to clear multiplexer events after
+# they're no longer applicable.
+my ($ret, $stdout, $stderr) = $node->psql(
+	'postgres',
+	"SELECT 'connected for call count'",
+	extra_params => ['-w'],
+	connstr => connstr(stage => 'token', retries => 2),
+	on_error_stop => 0);
+
+is($ret, 0, "call count connection succeeds");
+like(
+	$stderr,
+	qr@Visit https://example\.com/ and enter the code: postgresuser@,
+	"call count: stderr matches");
+
+my $count_pattern = qr/\[libpq\] total number of polls: (\d+)/;
+if (like($stderr, $count_pattern, "call count: count is printed"))
+{
+	# For reference, a typical flow with two retries might take between 5-15
+	# calls to the client implementation. And while this will probably continue
+	# to change across OSes and Curl updates, we're likely in trouble if we see
+	# hundreds or thousands of calls.
+	$stderr =~ $count_pattern;
+	cmp_ok($1, '<', 100, "call count is reasonably small");
+}
+
 # Stress test: make sure our builtin flow operates correctly even if the client
 # application isn't respecting PGRES_POLLING_READING/WRITING signals returned
 # from PQconnectPoll().
@@ -428,7 +457,7 @@ my @cmd = (
 	connstr(stage => 'all', retries => 1, interval => 1));
 
 note "running '" . join("' '", @cmd) . "'";
-my ($stdout, $stderr) = run_command(\@cmd);
+($stdout, $stderr) = run_command(\@cmd);
 
 like($stdout, qr/connection succeeded/, "stress-async: stdout matches");
 unlike(
-- 
2.34.1

