Small improvements to substring()

Thomas Munro Sat, 14 Feb 2026 13:41:15 -0800

Hi,

As recently discussed on pgsql-bugs[1], substring() does unnecessary
work that can easily be deleted.  Here's a patch to try that.


[1] 
https://www.postgresql.org/message-id/flat/19406-9867fddddd724fca%40postgresql.org

From 4329204632d6321b2ada84a489933f146689428c Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Sun, 15 Feb 2026 10:38:37 +1300
Subject: [PATCH v1] Improve substring() implementation.

Previously it made two passes over multibyte strings, one to count the
characters up until the required limit, and another to search for the
user-specified range to copy out.  The first pass can be folded into the
second.

The only user-visible effect of this change requires a corrupted string
containing internal NUL, as shown in tests.  Previously, substring()
would consider NUL to be a terminator and stop there in the (now
removed) first scan.  Now it considers it to be a character, and
propagates it into the return value.  Neither behavior is wrong per
contract, so long as PostgreSQL continues its blanket ban on NUL in the
"text" data type (and if that ever changed, this new behavior would
presumably be the only correct one).

While here, remove special cases to handle empty strings.  The general
algorithm already has the same effect without extra code or cycles.

These behaviors are exercised by existing tests.

Reviewed-by:
Discussion:
---
 src/backend/utils/adt/varlena.c        | 132 +++++--------------------
 src/test/regress/expected/encoding.out |  28 +++---
 src/test/regress/sql/encoding.sql      |   7 +-
 3 files changed, 39 insertions(+), 128 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index d8ea7f45bcc..b8467a7ce8c 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -133,7 +133,6 @@ static text *text_substring(Datum str,
 							int32 start,
 							int32 length,
 							bool length_not_specified);
-static int	pg_mbcharcliplen_chars(const char *mbstr, int len, int limit);
 static text *text_overlay(text *t1, text *t2, int sp, int sl);
 static int	text_position(text *t1, text *t2, Oid collid);
 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
@@ -645,34 +644,27 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 * detoasting, so we'll grab a conservatively large slice now and go
 		 * back later to do the right thing
 		 */
-		int32		slice_start;
 		int32		slice_size;
-		int32		slice_strlen;
-		int32		slice_len;
+		const char *slice_end;
 		text	   *slice;
-		int32		E1;
 		int32		i;
 		char	   *p;
 		char	   *s;
 		text	   *ret;
 
-		/*
-		 * We need to start at position zero because there is no way to know
-		 * in advance which byte offset corresponds to the supplied start
-		 * position.
-		 */
-		slice_start = 0;
-
-		if (length_not_specified)	/* special case - get length to end of
-									 * string */
-			slice_size = L1 = -1;
+		if (length_not_specified)	/* special case - get whole string */
+		{
+			slice_size = -1;
+			E = INT32_MAX;
+		}
 		else if (length < 0)
 		{
 			/* SQL99 says to throw an error for E < S, i.e., negative length */
 			ereport(ERROR,
 					(errcode(ERRCODE_SUBSTRING_ERROR),
 					 errmsg("negative substring length not allowed")));
-			slice_size = L1 = -1;	/* silence stupider compilers */
+			slice_size = -1;	/* silence stupider compilers */
+			E = INT32_MAX;
 		}
 		else if (pg_add_s32_overflow(S, length, &E))
 		{
@@ -680,24 +672,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 			 * L could be large enough for S + L to overflow, in which case
 			 * the substring must run to end of string.
 			 */
-			slice_size = L1 = -1;
+			slice_size = -1;
+			E = INT32_MAX;
 		}
 		else
 		{
-			/*
-			 * Ending at position 1, exclusive, obviously yields an empty
-			 * string.  A zero or negative value can happen if the start was
-			 * negative or one. SQL99 says to return a zero-length string.
-			 */
-			if (E <= 1)
-				return cstring_to_text("");
-
-			/*
-			 * if E is past the end of the string, the tuple toaster will
-			 * truncate the length for us
-			 */
-			L1 = E - S1;
-
 			/*
 			 * Total slice size in bytes can't be any longer than the
 			 * inclusive end position times the encoding max length.  If that
@@ -713,71 +692,39 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 */
 		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
-			slice = DatumGetTextPSlice(str, slice_start, slice_size);
+			slice = DatumGetTextPSlice(str, 0, slice_size);
 		else
 			slice = (text *) DatumGetPointer(str);
 
-		/* see if we got back an empty string */
-		slice_len = VARSIZE_ANY_EXHDR(slice);
-		if (slice_len == 0)
-		{
-			if (slice != (text *) DatumGetPointer(str))
-				pfree(slice);
-			return cstring_to_text("");
-		}
-
-		/*
-		 * Now we can get the actual length of the slice in MB characters,
-		 * stopping at the end of the substring.  Continuing beyond the
-		 * substring end could find an incomplete character attributable
-		 * solely to DatumGetTextPSlice() chopping in the middle of a
-		 * character, and it would be superfluous work at best.
-		 */
-		slice_strlen =
-			(slice_size == -1 ?
-			 pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) :
-			 pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1));
-
 		/*
-		 * Check that the start position wasn't > slice_strlen. If so, SQL99
-		 * says to return a zero-length string.
+		 * pg_mblen_range() validates lengths and raises an encoding error if
+		 * we make it all the way to slice_end and find a partial (corrupted)
+		 * sequence there.
 		 */
-		if (S1 > slice_strlen)
-		{
-			if (slice != (text *) DatumGetPointer(str))
-				pfree(slice);
-			return cstring_to_text("");
-		}
-
-		/*
-		 * Adjust L1 and E1 now that we know the slice string length. Again
-		 * remember that S1 is one based, and slice_start is zero based.
-		 */
-		if (L1 > -1)
-			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
-		else
-			E1 = slice_start + 1 + slice_strlen;
+		p = VARDATA_ANY(slice);
+		slice_end = p + VARSIZE_ANY_EXHDR(slice);
 
 		/*
-		 * Find the start position in the slice; remember S1 is not zero based
+		 * Find the start position in the slice; remember S1 is not zero
+		 * based. If we run out of input first, we'll return an empty string.
 		 */
-		p = VARDATA_ANY(slice);
-		for (i = 0; i < S1 - 1; i++)
-			p += pg_mblen_unbounded(p);
+		for (i = 1; i < S1 && p < slice_end; i++)
+			p += pg_mblen_range(p, slice_end);
 
 		/* hang onto a pointer to our start position */
 		s = p;
 
 		/*
 		 * Count the actual bytes used by the substring of the requested
-		 * length.
+		 * length, again giving up if we run out of input.
 		 */
-		for (i = S1; i < E1; i++)
-			p += pg_mblen_unbounded(p);
+		for (i = S1; i < E && p < slice_end; i++)
+			p += pg_mblen_range(p, slice_end);
 
 		ret = (text *) palloc(VARHDRSZ + (p - s));
 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
-		memcpy(VARDATA(ret), s, (p - s));
+		if (s < p)
+			memcpy(VARDATA(ret), s, (p - s));
 
 		if (slice != (text *) DatumGetPointer(str))
 			pfree(slice);
@@ -791,35 +738,6 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 	return NULL;
 }
 
-/*
- * pg_mbcharcliplen_chars -
- *	Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes.
- *
- *	This mirrors all the dubious historical behavior, so it's static to
- *	discourage proliferation.  The assertions are specific to the one caller.
- */
-static int
-pg_mbcharcliplen_chars(const char *mbstr, int len, int limit)
-{
-	int			nch = 0;
-	int			l;
-
-	Assert(len > 0);
-	Assert(limit > 0);
-	Assert(pg_database_encoding_max_length() > 1);
-
-	while (len > 0 && *mbstr)
-	{
-		l = pg_mblen_with_len(mbstr, len);
-		nch++;
-		if (nch == limit)
-			break;
-		len -= l;
-		mbstr += l;
-	}
-	return nch;
-}
-
 /*
  * textoverlay
  *	Replace specified substring of first string with second
diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out
index cac1cb74782..eec879b05ec 100644
--- a/src/test/regress/expected/encoding.out
+++ b/src/test/regress/expected/encoding.out
@@ -90,6 +90,13 @@ SELECT length(with_nul) FROM regress_encoding;
       4
 (1 row)
 
+SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace 
+----------------
+ é
+(1 row)
+
+-- NUL = character
 SELECT substring(with_nul, 3, 1) FROM regress_encoding;
  substring 
 -----------
@@ -102,25 +109,12 @@ SELECT substring(with_nul, 4, 1) FROM regress_encoding;
  é
 (1 row)
 
-SELECT substring(with_nul, 5, 1) FROM regress_encoding;
- substring 
------------
- 
-(1 row)
-
-SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
- convert_to 
-------------
- \x
-(1 row)
-
-SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
- regexp_replace 
-----------------
- é
+SELECT substring(with_nul, 5, 1) = test_bytea_to_text('\x00') FROM regress_encoding;
+ ?column? 
+----------
+ t
 (1 row)
 
--- NUL = character
 SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
  with_nul | reverse | reverse 
 ----------+---------+---------
diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql
index 782f90f0d62..7de95f49947 100644
--- a/src/test/regress/sql/encoding.sql
+++ b/src/test/regress/sql/encoding.sql
@@ -52,12 +52,11 @@ SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
 
 -- NUL = terminator
 SELECT length(with_nul) FROM regress_encoding;
-SELECT substring(with_nul, 3, 1) FROM regress_encoding;
-SELECT substring(with_nul, 4, 1) FROM regress_encoding;
-SELECT substring(with_nul, 5, 1) FROM regress_encoding;
-SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
 SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
 -- NUL = character
+SELECT substring(with_nul, 3, 1) FROM regress_encoding;
+SELECT substring(with_nul, 4, 1) FROM regress_encoding;
+SELECT substring(with_nul, 5, 1) = test_bytea_to_text('\x00') FROM regress_encoding;
 SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
 
 -- If a corrupted string contains NUL in the tail bytes of a multibyte
-- 
2.47.3

Small improvements to substring()

Reply via email to