From aeda2ae243c6bfb35e47154d4c9a703e15beb673 Mon Sep 17 00:00:00 2001
From: Jacob Champion <jacob.champion@enterprisedb.com>
Date: Thu, 17 Jul 2025 08:45:45 -0700
Subject: [PATCH v2 2/2] libpq: Drain all pending bytes from SSL/GSS during
 pqReadData()

The previous commit strengthened a workaround for a hang when large
messages are split across TLS records/GSS tokens. Because that
workaround is implemented in libpq internals, it can only help us when
libpq itself is polling on the socket. In nonblocking situations, where
the client above libpq is expected to poll, the same bugs can show up.

As a contrived example, consider a large protocol-2.0 error coming back
from a server during PQconnectPoll(), split in an odd way across two
records:

    -- TLS record (8192-byte payload) --
    EEEE[...repeated a total of 8192 times]
    -- TLS record (8193-byte payload) --
    EEEE[...repeated a total of 8192 times]\0

The first record will fill the first half of the libpq receive buffer,
which is 16k long by default. The second record completely fills the
last half with its first 8192 bytes, leaving the terminating NULL in the
OpenSSL buffer. Since we still haven't seen the terminator at our level,
PQconnectPoll() will return PGRES_POLLING_READING, expecting to come
back when the server has sent "the rest" of the data.  But there is
nothing left to read from the socket; OpenSSL had to pull all of the
data in the 8193-byte record off of the wire to decrypt it.

(A real server would probably not split up the records this way, nor
keep the connection open after sending a fatal connection error. But
servers that regularly use larger TLS records can get the libpq receive
buffer into the same state if DataRows are big enough, as reported on
the list.)

This is a layering violation. libpq makes decisions based on data in the
application buffer, above the transport buffer (whether SSL or GSS), but
clients are polling the socket, below the transport buffer. One way to
fix this in a backportable way, without changing APIs too much, is to
ensure data never stays in the transport buffer. Then pqReadData's
postconditions will look similar for both raw sockets and SSL/GSS: any
available data is either in the application buffer, or still on the
socket.

Building on the prior commit, add a function to the pqsecure API layer
which drains all pending data from the transport layer into
conn->inBuffer, expanding the buffer as necessary. pqReadData() calls
this function before returning when pending data exists. This is not
particularly efficient from an architectural perspective (the
pqsecure_read() implementations take care to fit their packets into the
current buffer, and that effort is now completely discarded), but it's
hopefully easier to reason about than a full rewrite would be for the
back branches.

Reported-by: Lars Kanis <lars@greiz-reinsdorf.de>
Discussion: https://postgr.es/m/2039ac58-d3e0-434b-ac1a-2a987f3b4cb1%40greiz-reinsdorf.de
Backpatch-through: 13
---
 src/interfaces/libpq/fe-misc.c           |  51 ++++++++++-
 src/interfaces/libpq/fe-secure-gssapi.c  |  27 ++++++
 src/interfaces/libpq/fe-secure-openssl.c | 109 +++++++++++++++++++++++
 src/interfaces/libpq/fe-secure.c         |  97 +++++++++++++++++++-
 src/interfaces/libpq/libpq-int.h         |   7 ++
 5 files changed, 288 insertions(+), 3 deletions(-)

diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 434216ff89f..49cf32e1bb7 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -55,6 +55,7 @@ static int	pqPutMsgBytes(const void *buf, size_t len, PGconn *conn);
 static int	pqSendSome(PGconn *conn, int len);
 static int	pqSocketCheck(PGconn *conn, int forRead, int forWrite,
 						  pg_usec_time_t end_time);
+static int	pqReadData_internal(PGconn *conn);
 
 /*
  * PQlibVersion: return the libpq version number
@@ -593,6 +594,13 @@ pqPutMsgEnd(PGconn *conn)
 
 /* ----------
  * pqReadData: read more data, if any is available
+ *
+ * Upon a successful return, callers may assume that either 1) all available
+ * bytes have been consumed from the socket, or 2) the socket is still marked
+ * readable by the OS. (In other words: after a successful pqReadData, it's safe
+ * to tell a client to poll for readable bytes on the socket without any further
+ * draining of the SSL/GSS transport buffers.)
+ *
  * Possible return values:
  *	 1: successfully loaded at least one more byte
  *	 0: no data is presently available, but no error detected
@@ -605,8 +613,8 @@ pqPutMsgEnd(PGconn *conn)
 int
 pqReadData(PGconn *conn)
 {
-	int			someread = 0;
-	int			nread;
+	int			available;
+	bool		pending;
 
 	if (conn->sock == PGINVALID_SOCKET)
 	{
@@ -614,6 +622,45 @@ pqReadData(PGconn *conn)
 		return -1;
 	}
 
+	available = pqReadData_internal(conn);
+	if (available < 0)
+		return -1;
+
+	/*
+	 * Make sure there are no bytes stuck in layers between conn->inBuffer and
+	 * the socket, to make it safe for clients to poll on PQsocket(). See
+	 * pqsecure_drain_pending's documentation for details.
+	 */
+	pending = pqsecure_read_is_pending(conn);
+
+	if (available && pending)
+	{
+		if (pqsecure_drain_pending(conn))
+			return -1;
+	}
+	else if (!available)
+	{
+		/*
+		 * If we're not returning any bytes from the underlying transport,
+		 * that must imply there aren't any in the transport buffer...
+		 */
+		Assert(!pending);
+	}
+
+	return available;
+}
+
+/*
+ * Workhorse for pqReadData(). It's kept separate from the
+ * pqsecure_drain_pending() logic to avoid adding to this function's goto
+ * complexity.
+ */
+static int
+pqReadData_internal(PGconn *conn)
+{
+	int			someread = 0;
+	int			nread;
+
 	/* Left-justify any data in the buffer to make room */
 	if (conn->inStart < conn->inEnd)
 	{
diff --git a/src/interfaces/libpq/fe-secure-gssapi.c b/src/interfaces/libpq/fe-secure-gssapi.c
index 7c88e64cfd2..329c2559b88 100644
--- a/src/interfaces/libpq/fe-secure-gssapi.c
+++ b/src/interfaces/libpq/fe-secure-gssapi.c
@@ -475,6 +475,33 @@ pg_GSS_read_is_pending(PGconn *conn)
 	return PqGSSResultLength > PqGSSResultNext;
 }
 
+int
+pg_GSS_drain_pending(PGconn *conn)
+{
+	int			pending;
+
+	/* Figure out how many bytes to take off the connection. */
+	Assert(PqGSSResultLength >= PqGSSResultNext);
+	pending = PqGSSResultLength - PqGSSResultNext;
+
+	if (!pending)
+	{
+		/* Nothing to do. */
+		return 0;
+	}
+
+	/* Expand the input buffer if necessary. */
+	if (pqCheckInBufferSpace(conn->inEnd + (size_t) pending, conn))
+		return -1;				/* errorMessage already set */
+
+	/* Now read the buffered data. */
+	memcpy(conn->inBuffer + conn->inEnd, PqGSSResultBuffer + PqGSSResultNext, pending);
+	conn->inEnd += pending;
+	PqGSSResultNext += pending;
+
+	return 0;
+}
+
 /*
  * Negotiate GSSAPI transport for a connection.  When complete, returns
  * PGRES_POLLING_OK.  Will return PGRES_POLLING_READING or
diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c
index 8f975561e51..b95e229bd08 100644
--- a/src/interfaces/libpq/fe-secure-openssl.c
+++ b/src/interfaces/libpq/fe-secure-openssl.c
@@ -236,6 +236,115 @@ pgtls_read_is_pending(PGconn *conn)
 	return SSL_pending(conn->ssl) > 0;
 }
 
+/*
+ * Helper callback for use with ERR_print_errors_cb(). This appends the raw
+ * error queue to the provided PQExpBuffer, one entry per line.
+ *
+ * Note that this is not pretty output; it's meant for debugging.
+ */
+static int
+append_error_queue(const char *str, size_t len, void *u)
+{
+	PQExpBuffer buf = u;
+
+	appendBinaryPQExpBuffer(buf, str, len);
+	appendPQExpBufferChar(buf, '\n');
+
+	return 0;
+}
+
+int
+pgtls_drain_pending(PGconn *conn)
+{
+	int			pending;
+	size_t		drained;
+
+	/*
+	 * OpenSSL readahead is documented to break SSL_pending(). Plus, we can't
+	 * afford to have OpenSSL take bytes off the socket without processing
+	 * them; that breaks the postconditions for pqsecure_drain_pending().
+	 */
+	Assert(!SSL_get_read_ahead(conn->ssl));
+
+	/* Figure out how many bytes to take off the connection. */
+	pending = SSL_pending(conn->ssl);
+
+	if (!pending)
+	{
+		/* Nothing to do. */
+		return 0;
+	}
+	else if (pending < 0)
+	{
+		/* Shouldn't be possible, but don't let it mess up the math below. */
+		Assert(false);
+		libpq_append_conn_error(conn, "OpenSSL reports negative bytes pending");
+		return -1;
+	}
+	else if (pending == INT_MAX)
+	{
+		/*
+		 * If we ever found a legitimate way to hit this, we'd need to loop
+		 * around to call SSL_pending() again. Throw an error rather than
+		 * complicate the code in that way, because SSL_read() should be
+		 * bounded to the size of a single TLS record, and conn->inBuffer
+		 * can't currently go past INT_MAX in size anyway.
+		 */
+		libpq_append_conn_error(conn, "OpenSSL reports INT_MAX bytes pending");
+		return -1;
+	}
+
+	/* Expand the input buffer if necessary. */
+	if (pqCheckInBufferSpace(conn->inEnd + (size_t) pending, conn))
+		return -1;				/* errorMessage already set */
+
+	/*
+	 * Now read the buffered data.
+	 *
+	 * Don't defer to pgtls_read(); OpenSSL should guarantee that pending data
+	 * comes off in a single call, and we don't want to use the more
+	 * complicated read-loop behavior. We still have to manage the error
+	 * queue.
+	 */
+	ERR_clear_error();
+	if (!SSL_read_ex(conn->ssl, conn->inBuffer + conn->inEnd, pending, &drained))
+	{
+		int			err = SSL_get_error(conn->ssl, 0);
+
+		/*
+		 * Something is very wrong. Report the error code and the entirety of
+		 * the error queue without any attempt at interpretation. Probably not
+		 * worth complicating things for the sake of translation, either.
+		 */
+		appendPQExpBuffer(&conn->errorMessage,
+						  "unexpected error code %d while draining SSL buffer; ",
+						  err);
+
+		if (ERR_peek_error())
+		{
+			appendPQExpBufferStr(&conn->errorMessage, "error queue follows:\n");
+			ERR_print_errors_cb(append_error_queue, &conn->errorMessage);
+		}
+		else
+			appendPQExpBufferStr(&conn->errorMessage,
+								 "no error queue provided\n");
+
+		return -1;
+	}
+
+	/* Final consistency check. */
+	if (drained != pending)
+	{
+		libpq_append_conn_error(conn,
+								"drained only %zu of %d pending bytes in SSL buffer",
+								drained, pending);
+		return -1;
+	}
+
+	conn->inEnd += pending;
+	return 0;
+}
+
 ssize_t
 pgtls_write(PGconn *conn, const void *ptr, size_t len)
 {
diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c
index 94c97ec26fb..82ae07be674 100644
--- a/src/interfaces/libpq/fe-secure.c
+++ b/src/interfaces/libpq/fe-secure.c
@@ -244,7 +244,9 @@ pqsecure_raw_read(PGconn *conn, void *ptr, size_t len)
 }
 
 /*
- * Returns true if there are any bytes available in the transport buffer.
+ * Returns true if there are any bytes available in the transport buffer. See
+ * pqsecure_drain_pending() for a more complete discussion of the concepts
+ * involved.
  */
 bool
 pqsecure_read_is_pending(PGconn *conn)
@@ -262,6 +264,99 @@ pqsecure_read_is_pending(PGconn *conn)
 	return 0;
 }
 
+/*---
+ * Drains any transport data that is already buffered in userspace and adds it
+ * to conn->inBuffer, enlarging inBuffer if necessary. The drain fails if
+ * inBuffer cannot be made to hold all available transport data.
+ *
+ * Implementations should not attempt to read any more data from the socket
+ * while draining the transport buffer. After a successful return,
+ * pqsecure_bytes_pending() must be zero.
+ *
+ * This operation is necessary to prevent deadlock, due to a layering violation
+ * designed into our asynchronous client API: pqReadData() and all the parsing
+ * routines above it receive data from the SSL/GSS transport buffer, but clients
+ * poll on the raw PQsocket() handle. So data can be "lost" in the intermediate
+ * layer if we don't take it out here.
+ *
+ * To illustrate what we're trying to prevent, say that the server is sending
+ * two messages at once in response to a query (Aaaa and Bb), the libpq buffer
+ * is five characters in size, and TLS records max out at three-character
+ * payloads.
+ *
+ *   Client    libpq      SSL      Socket
+ *     |         |         |         |
+ *     |      [     ]    [   ]     [   ]    [1] Buffers are empty, client is
+ *     x --------------------------> |          polling on socket
+ *     |         |         |         |
+ *     |      [     ]    [   ]     [xxx]    [2] First record is received; poll
+ *     | <-------------------------- |          signals read-ready
+ *     |         |         |         |
+ *     x ---> [     ]    [   ]     [xxx]    [3] Client calls PQconsumeInput()
+ *     |         |         |         |
+ *     |      [     ] -> [   ]     [xxx]    [4] libpq calls pqReadData() to fill
+ *     |         |         |         |          the receive buffer
+ *     |      [     ]    [Aaa] <-- [   ]    [5] SSL pulls payload off the wire
+ *     |         |         |         |          and decrypts it
+ *     |      [Aaa  ] <- [   ]     [   ]    [6] pqsecure_read() takes all data
+ *     |         |         |         |
+ *     | <--- [Aaa  ]    [   ]     [   ]    [7] PQconsumeInput() returns with a
+ *     x --------------------------> |          partial message, PQisBusy() is
+ *     |         |         |         |          still true, client polls again
+ *     |      [Aaa  ]    [   ]     [xxx]    [8] Second record is received; poll
+ *     | <-------------------------- |          signals read-ready
+ *     |         |         |         |
+ *     x ---> [Aaa  ]    [   ]     [xxx]    [9] Client calls PQconsumeInput()
+ *     |         |         |         |
+ *     |      [Aaa  ] -> [   ]     [xxx]   [10] libpq calls pqReadData() to fill
+ *     |         |         |         |          the receive buffer
+ *     |      [Aaa  ]    [aBb] <-- [   ]   [11] SSL decrypts
+ *     |         |         |         |
+ *     |      [AaaaB] <- [b  ]     [   ]   [12] pqsecure_read() fills its
+ *     |         |         |         |          buffer, taking only two bytes
+ *     | <--- [AaaaB]    [b  ]     [   ]   [13] PQconsumeInput() returns with a
+ *     |         |         |         |          complete message buffered;
+ *     |         |         |         |          PQisBusy() is false
+ *     x ---> [AaaaB]    [b  ]     [   ]   [14] Client calls PQgetResult()
+ *     |         |         |         |
+ *     | <--- [B    ]    [b  ]     [   ]   [15] Aaaa is returned; PQisBusy() is
+ *     x --------------------------> |          true and client polls again
+ *     .         |         |         .
+ *     .      [B    ]    [b  ]       .     [16] No packets, and client hangs.
+ *     .         |         |         .
+ *
+ */
+int
+pqsecure_drain_pending(PGconn *conn)
+{
+	int			ret;
+
+#ifdef USE_SSL
+	if (conn->ssl_in_use)
+	{
+		ret = pgtls_drain_pending(conn);
+	}
+	else
+#endif
+#ifdef ENABLE_GSS
+	if (conn->gssenc)
+	{
+		ret = pg_GSS_drain_pending(conn);
+	}
+	else
+#endif
+	{
+		/* Plaintext connections have no transport buffer. */
+		ret = 0;
+	}
+
+	/* Keep the implementation honest. */
+	if (ret == 0)
+		Assert(!pqsecure_read_is_pending(conn));
+
+	return ret;
+}
+
 /*
  *	Write data to a secure connection.
  *
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index c38c1ea0086..a0690a96a32 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -811,6 +811,7 @@ extern PostgresPollingStatusType pqsecure_open_client(PGconn *);
 extern void pqsecure_close(PGconn *);
 extern ssize_t pqsecure_read(PGconn *, void *ptr, size_t len);
 extern bool pqsecure_read_is_pending(PGconn *);
+extern int	pqsecure_drain_pending(PGconn *);
 extern ssize_t pqsecure_write(PGconn *, const void *ptr, size_t len);
 extern ssize_t pqsecure_raw_read(PGconn *, void *ptr, size_t len);
 extern ssize_t pqsecure_raw_write(PGconn *, const void *ptr, size_t len);
@@ -851,6 +852,11 @@ extern ssize_t pgtls_read(PGconn *conn, void *ptr, size_t len);
  */
 extern bool pgtls_read_is_pending(PGconn *conn);
 
+/*
+ *	Reads any data waiting in the SSL read buffer into the connection buffer.
+ */
+extern int	pgtls_drain_pending(PGconn *conn);
+
 /*
  *	Write data to a secure connection.
  *
@@ -898,6 +904,7 @@ extern PostgresPollingStatusType pqsecure_open_gss(PGconn *conn);
 extern ssize_t pg_GSS_write(PGconn *conn, const void *ptr, size_t len);
 extern ssize_t pg_GSS_read(PGconn *conn, void *ptr, size_t len);
 extern bool pg_GSS_read_is_pending(PGconn *conn);
+extern int	pg_GSS_drain_pending(PGconn *conn);
 #endif
 
 /* === in fe-trace.c === */
-- 
2.34.1

