From 76a13f46fb07903d67f980b552346ebe6b3a8fe2 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Wed, 29 Oct 2025 15:53:12 +0400
Subject: [PATCH v3 3/3] Add base32hex encoding support to encode() and
 decode()

Implement base32hex encoding/decoding per RFC 4648 Section 7 for
encode() and decode() functions. This encoding uses the extended hex
alphabet (0-9, A-V) which preserves sort order.

The encode() function produces unpadded output, while decode() accepts
both padded and unpadded input. Decoding is case-insensitive and ignores
whitespace.

This is particularly useful for encoding UUIDs compactly:
    SELECT encode(uuid_value::bytea, 'base32hex');
produces a 26-character string compared to the standard 36-character
UUID representation.

Suggested-by: Sergey Prokhorenko <sergeyprokhorenko@yahoo.com.au>
---
 doc/src/sgml/func/func-binarystring.sgml |  25 +++++
 src/backend/utils/adt/encode.c           | 124 +++++++++++++++++++++++
 src/test/regress/expected/uuid.out       |  88 ++++++++++++++++
 src/test/regress/sql/uuid.sql            |  27 +++++
 4 files changed, 264 insertions(+)

diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml
index dd7037811af..4659ad49787 100644
--- a/doc/src/sgml/func/func-binarystring.sgml
+++ b/doc/src/sgml/func/func-binarystring.sgml
@@ -729,6 +729,7 @@
        <parameter>format</parameter> values are:
        <link linkend="encode-format-base64"><literal>base64</literal></link>,
        <link linkend="encode-format-base64url"><literal>base64url</literal></link>,
+       <link linkend="encode-format-base32hex"><literal>base32hex</literal></link>,
        <link linkend="encode-format-escape"><literal>escape</literal></link>,
        <link linkend="encode-format-hex"><literal>hex</literal></link>.
       </para>
@@ -804,6 +805,30 @@
      </listitem>
     </varlistentry>
 
+    <varlistentry id="encode-format-base32hex">
+     <term>base32hex
+      <indexterm>
+       <primary>base32hex format</primary>
+      </indexterm></term>
+     <listitem>
+      <para>
+       The <literal>base32hex</literal> format is that of
+       <ulink url="https://datatracker.ietf.org/doc/html/rfc4648#section-7">
+       RFC 4648 Section 7</ulink>.  It uses the extended hex alphabet
+       (0-9, A-V) which preserves sort order when encoding binary data.
+       The <function>encode</function> function produces unpadded output,
+       while <function>decode</function> accepts both padded and unpadded
+       input. Decoding is case-insensitive and ignores whitespace characters.
+      </para>
+      <para>
+       This format is particularly useful for encoding UUIDs in a compact,
+       sortable format: <literal>encode(uuid_value::bytea, 'base32hex')</literal>
+       produces a 26-character string compared to the standard 36-character
+       UUID representation.
+      </para>
+     </listitem>
+    </varlistentry>
+
     <varlistentry id="encode-format-escape">
      <term>escape
      <indexterm>
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index aabe9913eee..c31ab60d4b7 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -821,6 +821,124 @@ esc_dec_len(const char *src, size_t srclen)
 	return len;
 }
 
+/*
+ * BASE32HEX
+ */
+
+static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
+
+static uint64
+base32hex_enc_len(const char *src, size_t srclen)
+{
+	/* Unpadded output size: one character per 5 input bits, rounded up */
+	return ((uint64) srclen * 8 + 4) / 5;
+}
+
+static uint64
+base32hex_dec_len(const char *src, size_t srclen)
+{
+	/* Size estimate from srclen alone, 5 bits per character; src is not read */
+	return ((uint64) srclen * 5) / 8;
+}
+
+static uint64
+base32hex_encode(const char *src, size_t srclen, char *dst)
+{
+	const unsigned char *data = (const unsigned char *) src;
+	uint64		bits_buffer = 0;
+	int			bits_in_buffer = 0;
+	uint64		output_pos = 0;
+	size_t		i;
+
+	for (i = 0; i < srclen; i++)
+	{
+		/* Append the next input byte below the bits that are still pending */
+		bits_buffer = (bits_buffer << 8) | data[i];
+		bits_in_buffer += 8;
+
+		/* Emit one output character for every complete 5-bit group */
+		while (bits_in_buffer >= 5)
+		{
+			bits_in_buffer -= 5;
+			/* The oldest 5 pending bits index into the alphabet table */
+			dst[output_pos++] = base32hex_table[(bits_buffer >> bits_in_buffer) & 0x1F];
+			/* Drop the emitted bits so only the unconsumed low bits remain */
+			bits_buffer &= ((1ULL << bits_in_buffer) - 1);
+		}
+	}
+
+	/* Flush leftover bits, zero-filled on the right; no '=' padding is added */
+	if (bits_in_buffer > 0)
+	{
+		dst[output_pos++] = base32hex_table[(bits_buffer << (5 - bits_in_buffer)) & 0x1F];
+	}
+
+	return output_pos;
+}
+
+/*
+ * Decode base32hex text in src into dst; returns the number of bytes written.
+ */
+static uint64
+base32hex_decode(const char *src, size_t srclen, char *dst)
+{
+	const unsigned char *data = (const unsigned char *) src;
+	uint64		bits_buffer = 0;
+	int			bits_in_buffer = 0;
+	uint64		output_pos = 0;
+	size_t		i;
+	size_t		decode_len = srclen;
+
+	/* Trailing '=' padding (RFC 4648) is optional; strip it before decoding */
+	while (decode_len > 0 && data[decode_len - 1] == '=')
+		decode_len--;
+
+	for (i = 0; i < decode_len; i++)
+	{
+		unsigned char c = data[i];
+		int			val;
+
+		/* Skip whitespace */
+		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+			continue;
+
+		/* Decode base32hex character (0-9, A-V, case-insensitive) */
+		if (c >= '0' && c <= '9')
+			val = c - '0';
+		else if (c >= 'A' && c <= 'V')
+			val = c - 'A' + 10;
+		else if (c >= 'a' && c <= 'v')
+			val = c - 'a' + 10;
+		else
+			/* Report from src, not the 1-byte copy: symbol may be multibyte */
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
+							pg_mblen(src + i), src + i)));
+
+		/* Add 5 bits to buffer */
+		bits_buffer = (bits_buffer << 5) | val;
+		bits_in_buffer += 5;
+
+		/* Emit each complete byte, keeping only the bits not yet written */
+		while (bits_in_buffer >= 8)
+		{
+			bits_in_buffer -= 8;
+			dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer);
+			bits_buffer &= ((1ULL << bits_in_buffer) - 1);
+		}
+	}
+
+	/* Verify no extra bits remain (padding bits should be zero) */
+	if (bits_in_buffer > 0 && (bits_buffer & ((1ULL << bits_in_buffer) - 1)) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid base32hex end sequence"),
+				 errhint("Input data has non-zero padding bits.")));
+
+	return output_pos;
+}
+
 /*
  * Common
  */
@@ -850,6 +968,12 @@ static const struct
 			pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode
 		}
 	},
+	{
+		"base32hex",
+		{
+			base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode
+		}
+	},
 	{
 		"escape",
 		{
diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out
index 24486084aaf..86d21a29093 100644
--- a/src/test/regress/expected/uuid.out
+++ b/src/test/regress/expected/uuid.out
@@ -321,5 +321,93 @@ SELECT '\x019a2f859ced7225b99d9c55044a2563'::bytea::uuid;
 SELECT '\x1234567890abcdef'::bytea::uuid; -- error
 ERROR:  invalid length for UUID
 DETAIL:  Expected 16 bytes, got 8.
+-- base32hex encoding via encode/decode
+SELECT encode('00000000-0000-0000-0000-000000000000'::uuid::bytea, 'base32hex');
+           encode           
+----------------------------
+ 00000000000000000000000000
+(1 row)
+
+SELECT encode('11111111-1111-1111-1111-111111111111'::uuid::bytea, 'base32hex');
+           encode           
+----------------------------
+ 248H248H248H248H248H248H24
+(1 row)
+
+SELECT encode('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid::bytea, 'base32hex');
+           encode           
+----------------------------
+ VVVVVVVVVVVVVVVVVVVVVVVVVS
+(1 row)
+
+SELECT encode('123e4567-e89b-12d3-a456-426614174000'::uuid::bytea, 'base32hex');
+           encode           
+----------------------------
+ 28V4APV8JC9D792M89J185Q000
+(1 row)
+
+-- test decode with base32hex
+SELECT decode('00000000000000000000000000', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 00000000-0000-0000-0000-000000000000
+(1 row)
+
+SELECT decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 123e4567-e89b-12d3-a456-426614174000
+(1 row)
+
+-- test round-trip conversions
+SELECT decode(encode('00000000-0000-0000-0000-000000000000'::uuid::bytea, 'base32hex'), 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 00000000-0000-0000-0000-000000000000
+(1 row)
+
+SELECT encode(decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid::bytea, 'base32hex');
+           encode           
+----------------------------
+ 28V4APV8JC9D792M89J185Q000
+(1 row)
+
+SELECT decode(encode('123e4567-e89b-12d3-a456-426614174000'::uuid::bytea, 'base32hex'), 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 123e4567-e89b-12d3-a456-426614174000
+(1 row)
+
+-- test case insensitivity
+SELECT decode('28v4apv8jc9d792m89j185q000', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 123e4567-e89b-12d3-a456-426614174000
+(1 row)
+
+SELECT decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 123e4567-e89b-12d3-a456-426614174000
+(1 row)
+
+-- test RFC 4648 padding (32 chars with 6 '=' signs)
+SELECT decode('28V4APV8JC9D792M89J185Q000======', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 123e4567-e89b-12d3-a456-426614174000
+(1 row)
+
+SELECT decode('00000000000000000000000000======', 'base32hex')::uuid;
+                decode                
+--------------------------------------
+ 00000000-0000-0000-0000-000000000000
+(1 row)
+
+-- test error cases for base32hex
+SELECT decode('28V4APV8JC9D792M89J185Q00W', 'base32hex')::uuid;  -- invalid character W
+ERROR:  invalid symbol "W" found while decoding base32hex sequence
+SELECT decode('28V4APV8JC9D792M89J185Q00!', 'base32hex')::uuid;  -- invalid character !
+ERROR:  invalid symbol "!" found while decoding base32hex sequence
 -- clean up
 DROP TABLE guid1, guid2, guid3 CASCADE;
diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql
index 63520d0b640..44e8fa8b243 100644
--- a/src/test/regress/sql/uuid.sql
+++ b/src/test/regress/sql/uuid.sql
@@ -151,5 +151,32 @@ SELECT '5b35380a-7143-4912-9b55-f322699c6770'::uuid::bytea;
 SELECT '\x019a2f859ced7225b99d9c55044a2563'::bytea::uuid;
 SELECT '\x1234567890abcdef'::bytea::uuid; -- error
 
+-- base32hex encoding via encode/decode
+SELECT encode('00000000-0000-0000-0000-000000000000'::uuid::bytea, 'base32hex');
+SELECT encode('11111111-1111-1111-1111-111111111111'::uuid::bytea, 'base32hex');
+SELECT encode('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid::bytea, 'base32hex');
+SELECT encode('123e4567-e89b-12d3-a456-426614174000'::uuid::bytea, 'base32hex');
+
+-- test decode with base32hex
+SELECT decode('00000000000000000000000000', 'base32hex')::uuid;
+SELECT decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid;
+
+-- test round-trip conversions
+SELECT decode(encode('00000000-0000-0000-0000-000000000000'::uuid::bytea, 'base32hex'), 'base32hex')::uuid;
+SELECT encode(decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid::bytea, 'base32hex');
+SELECT decode(encode('123e4567-e89b-12d3-a456-426614174000'::uuid::bytea, 'base32hex'), 'base32hex')::uuid;
+
+-- test case insensitivity
+SELECT decode('28v4apv8jc9d792m89j185q000', 'base32hex')::uuid;
+SELECT decode('28V4APV8JC9D792M89J185Q000', 'base32hex')::uuid;
+
+-- test RFC 4648 padding (32 chars with 6 '=' signs)
+SELECT decode('28V4APV8JC9D792M89J185Q000======', 'base32hex')::uuid;
+SELECT decode('00000000000000000000000000======', 'base32hex')::uuid;
+
+-- test error cases for base32hex
+SELECT decode('28V4APV8JC9D792M89J185Q00W', 'base32hex')::uuid;  -- invalid character W
+SELECT decode('28V4APV8JC9D792M89J185Q00!', 'base32hex')::uuid;  -- invalid character !
+
 -- clean up
 DROP TABLE guid1, guid2, guid3 CASCADE;
-- 
2.39.5 (Apple Git-154)

