Hi, > Also, a small nitpick is that we can use uint32 instead of uint64 for > 'bits_buffer'. I've attached the updated patch as well as the > difference from the previous version.
Then I suggest using uint32 for the bits_buffer variable in base32hex_encode() too. Also we should use 1U instead of 1ULL with uint32. -- Best regards, Aleksander Alekseev
From 02758297caf1022c3f07838b322113238c2eaa90 Mon Sep 17 00:00:00 2001 From: Andrey Borodin <[email protected]> Date: Wed, 29 Oct 2025 15:53:12 +0400 Subject: [PATCH v9] Add base32hex support to encode() and decode() functions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for base32hex encoding and decoding, as defined in RFC 4648 Section 7. Unlike standard base32, base32hex uses the extended hex alphabet (0-9, A-V) which preserves the lexicographical order of the encoded data. This is particularly useful for representing UUIDv7 values in a compact string format while maintaining their time-ordered sort property. The encode() function produces output padded with '=', while decode() accepts both padded and unpadded input. Following the behavior of other encoding types, decoding is case-insensitive. Suggested-by: Sergey Prokhorenko <[email protected]> Author: Andrey Borodin <[email protected]> Co-authored-by: Aleksander Alekseev <[email protected]> Reviewed-by: Masahiko Sawada <[email protected]> Reviewed-by: Илья Чердаков <[email protected]> Discussion: https://postgr.es/m/CAJ7c6TOramr1UTLcyB128LWMqita1Y7%3Darq3KHaU%3Dqikf5yKOQ%40mail.gmail.com --- doc/src/sgml/func/func-binarystring.sgml | 27 ++++ src/backend/utils/adt/encode.c | 153 ++++++++++++++++++++++- src/test/regress/expected/strings.out | 119 +++++++++++++++++- src/test/regress/expected/uuid.out | 16 +++ src/test/regress/sql/strings.sql | 31 ++++- src/test/regress/sql/uuid.sql | 9 ++ 6 files changed, 347 insertions(+), 8 deletions(-) diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml index b256381e01f..3f77f6d20b0 100644 --- a/doc/src/sgml/func/func-binarystring.sgml +++ b/doc/src/sgml/func/func-binarystring.sgml @@ -729,6 +729,7 @@ <parameter>format</parameter> values are: <link linkend="encode-format-base64"><literal>base64</literal></link>, <link linkend="encode-format-base64url"><literal>base64url</literal></link>, + <link linkend="encode-format-base32hex"><literal>base32hex</literal></link>, <link linkend="encode-format-escape"><literal>escape</literal></link>, <link linkend="encode-format-hex"><literal>hex</literal></link>. </para> @@ -804,6 +805,32 @@ </listitem> </varlistentry> + <varlistentry id="encode-format-base32hex"> + <term>base32hex + <indexterm> + <primary>base32hex format</primary> + </indexterm></term> + <listitem> + <para> + The <literal>base32hex</literal> format is that of + <ulink url="https://datatracker.ietf.org/doc/html/rfc4648#section-7"> + RFC 4648 Section 7</ulink>. It uses the extended hex alphabet + (<literal>0</literal>-<literal>9</literal> and + <literal>A</literal>-<literal>V</literal>) which preserves the lexicographical + sort order of the encoded data. The <function>encode</function> function + produces output padded with <literal>'='</literal>, while <function>decode</function> + accepts both padded and unpadded input. Decoding is case-insensitive and ignores + whitespace characters. + </para> + <para> + This format is useful for encoding UUIDs in a compact, sortable format: + <literal>substring(encode(uuid_value::bytea, 'base32hex') FROM 1 FOR 26)</literal> + produces a 26-character string compared to the standard 36-character + UUID representation. + </para> + </listitem> + </varlistentry> + <varlistentry id="encode-format-escape"> <term>escape <indexterm> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index f5f835e944a..334fa080b95 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base64", "base64url", "base32hex", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base64", "base64url", "base32hex", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -825,6 +825,145 @@ esc_dec_len(const char *src, size_t srclen) return len; } +/* + * BASE32HEX + */ + +static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV"; + +static uint64 +base32hex_enc_len(const char *src, size_t srclen) +{ + /* 5 bytes encode to 8 characters, round up to multiple of 8 for padding */ + return ((uint64) srclen + 4) / 5 * 8; +} + +static uint64 +base32hex_dec_len(const char *src, size_t srclen) +{ + /* Decode length is (srclen * 5) / 8, but we may have padding */ + return ((uint64) srclen * 5) / 8; +} + +static uint64 +base32hex_encode(const char *src, size_t srclen, char *dst) +{ + const unsigned char *data = (const unsigned char *) src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + size_t i; + + for (i = 0; i < srclen; i++) + { + /* Add 8 bits to the buffer */ + bits_buffer = (bits_buffer << 8) | data[i]; + bits_in_buffer += 8; + + /* Extract 5-bit chunks while we have enough bits */ + while (bits_in_buffer >= 5) + { + bits_in_buffer -= 5; + /* Extract top 5 bits */ + dst[output_pos++] = base32hex_table[(bits_buffer >> bits_in_buffer) & 0x1F]; + /* Clear the extracted bits by masking */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + } + + /* Handle remaining bits (if any) */ + if (bits_in_buffer > 0) + dst[output_pos++] = base32hex_table[(bits_buffer << (5 - bits_in_buffer)) & 0x1F]; + + /* Add padding to make length a multiple of 8 (per RFC 4648) */ + while (output_pos % 8 != 0) + dst[output_pos++] = '='; + + return output_pos; +} + +static uint64 +base32hex_decode(const char *src, size_t srclen, char *dst) +{ + const char *srcend = src + srclen, + *s = src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + int pos = 0; /* position within 8-character group (0-7) */ + bool end = false; /* have we seen padding? */ + + while (s < srcend) + { + char c = *s++; + int val; + + /* Skip whitespace */ + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + continue; + + if (c == '=') + { + /* + * The first padding is only valid at positions 2, 4, 5, or 7 + * within an 8-character group (corresponding to 1, 2, 3, or 4 + * input bytes). We only check the position for the first '=' + * character. + */ + if (!end) + { + if (pos != 2 && pos != 4 && pos != 5 && pos != 7) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unexpected \"=\" while decoding base32hex sequence"))); + end = true; + } + pos++; + continue; + } + + /* No data characters allowed after padding */ + if (end) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Decode base32hex character (0-9, A-V, case-insensitive) */ + if (c >= '0' && c <= '9') + val = c - '0'; + else if (c >= 'A' && c <= 'V') + val = c - 'A' + 10; + else if (c >= 'a' && c <= 'v') + val = c - 'a' + 10; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Add 5 bits to buffer */ + bits_buffer = (bits_buffer << 5) | val; + bits_in_buffer += 5; + pos++; + + /* Extract 8-bit bytes when we have enough bits */ + while (bits_in_buffer >= 8) + { + bits_in_buffer -= 8; + dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer); + /* Clear the extracted bits */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + + /* Reset position after each complete 8-character group */ + if (pos == 8) + pos = 0; + } + + return output_pos; +} + /* * Common */ @@ -854,6 +993,12 @@ static const struct pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode } }, + { + "base32hex", + { + base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode + } + }, { "escape", { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index f38688b5c37..0166a57b0d4 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2600,14 +2600,127 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape'); -- report an error with a hint listing valid encodings when an invalid encoding is specified SELECT encode('\x01'::bytea, 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base64", "base64url", "base32hex", "escape", and "hex". SELECT decode('00', 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base64", "base64url", "base32hex", "escape", and "hex". -- --- base64url encoding/decoding +-- base32hex encoding/decoding -- SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' + encode +-------- + +(1 row) + +SELECT encode('\x11', 'base32hex'); -- '24======' + encode +---------- + 24====== +(1 row) + +SELECT encode('\x1122', 'base32hex'); -- '24H0====' + encode +---------- + 24H0==== +(1 row) + +SELECT encode('\x112233', 'base32hex'); -- '24H36===' + encode +---------- + 24H36=== +(1 row) + +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' + encode +---------- + 24H36H0= +(1 row) + +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' + encode +---------- + 24H36H2L +(1 row) + +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + encode +------------------ + 24H36H2LCO====== +(1 row) + +SELECT decode('', 'base32hex'); -- '' + decode +-------- + \x +(1 row) + +SELECT decode('24======', 'base32hex'); -- \x11 + decode +-------- + \x11 +(1 row) + +SELECT decode('24H0====', 'base32hex'); -- \x1122 + decode +-------- + \x1122 +(1 row) + +SELECT decode('24H36===', 'base32hex'); -- \x112233 + decode +---------- + \x112233 +(1 row) + +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 + decode +------------ + \x11223344 +(1 row) + +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 + decode +-------------- + \x1122334455 +(1 row) + +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + decode +---------------- + \x112233445566 +(1 row) + +SELECT decode('24', 'base32hex'); -- OK, padding `=` are optional + decode +-------- + \x11 +(1 row) + +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive + decode +---------------- + \x112233445566 +(1 row) + +SELECT decode('=', 'base32hex'); -- error +ERROR: unexpected "=" while decoding base32hex sequence +SELECT decode('W', 'base32hex'); -- error +ERROR: invalid symbol "W" found while decoding base32hex sequence +SELECT decode('24H36H0=24', 'base32hex'); -- error +ERROR: invalid symbol "2" found while decoding base32hex sequence +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) + decode +-------- + \x08 +(1 row) + +SELECT decode('あ', 'base32hex'); -- error +ERROR: invalid symbol "あ" found while decoding base32hex sequence +-- +-- base64url encoding/decoding +-- -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w encode diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index d157ef7d0b3..d2a45a0f07c 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -327,5 +327,21 @@ SELECT v = v::bytea::uuid as matched FROM gen_random_uuid() v; t (1 row) +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT orig, encode(orig::bytea, 'base32hex') AS enc +FROM unnest(ARRAY[ + '123e4567-e89b-12d3-a456-426614174000', + '00000000-0000-0000-0000-000000000000', + '11111111-1111-1111-1111-111111111111', + 'ffffffff-ffff-ffff-ffff-ffffffffffff']::uuid[] +) AS orig ORDER BY enc; + orig | enc +--------------------------------------+---------------------------------- + 00000000-0000-0000-0000-000000000000 | 00000000000000000000000000====== + 11111111-1111-1111-1111-111111111111 | 248H248H248H248H248H248H24====== + 123e4567-e89b-12d3-a456-426614174000 | 28V4APV8JC9D792M89J185Q000====== + ffffffff-ffff-ffff-ffff-ffffffffffff | VVVVVVVVVVVVVVVVVVVVVVVVVS====== +(4 rows) + -- clean up DROP TABLE guid1, guid2, guid3 CASCADE; diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index d8a09737668..13fcfe21241 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -835,10 +835,39 @@ SELECT encode('\x01'::bytea, 'invalid'); -- error SELECT decode('00', 'invalid'); -- error -- --- base64url encoding/decoding +-- base32hex encoding/decoding -- SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' +SELECT encode('\x11', 'base32hex'); -- '24======' +SELECT encode('\x1122', 'base32hex'); -- '24H0====' +SELECT encode('\x112233', 'base32hex'); -- '24H36===' +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + +SELECT decode('', 'base32hex'); -- '' +SELECT decode('24======', 'base32hex'); -- \x11 +SELECT decode('24H0====', 'base32hex'); -- \x1122 +SELECT decode('24H36===', 'base32hex'); -- \x112233 +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + +SELECT decode('24', 'base32hex'); -- OK, padding `=` are optional +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive +SELECT decode('=', 'base32hex'); -- error +SELECT decode('W', 'base32hex'); -- error +SELECT decode('24H36H0=24', 'base32hex'); -- error +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) +SELECT decode('あ', 'base32hex'); -- error + + +-- +-- base64url encoding/decoding +-- + -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w SELECT decode('abc-_w', 'base64url'); -- \x69b73eff diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index f512f4dea1d..ee14802630a 100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -152,5 +152,14 @@ SELECT '\x019a2f859ced7225b99d9c55044a2563'::bytea::uuid; SELECT '\x1234567890abcdef'::bytea::uuid; -- error SELECT v = v::bytea::uuid as matched FROM gen_random_uuid() v; +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT orig, encode(orig::bytea, 'base32hex') AS enc +FROM unnest(ARRAY[ + '123e4567-e89b-12d3-a456-426614174000', + '00000000-0000-0000-0000-000000000000', + '11111111-1111-1111-1111-111111111111', + 'ffffffff-ffff-ffff-ffff-ffffffffffff']::uuid[] +) AS orig ORDER BY enc; + -- clean up DROP TABLE guid1, guid2, guid3 CASCADE; -- 2.43.0
