On 2019-Oct-07, Anders Åstrand wrote:
> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).
Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost
ready to me; however, documentation is missing. I added a stub; can you
please complete that?
To answer Arthur Zakirov's question: yes, the standard recommends
("should") using uppercase hexadecimal digits:
: For consistency, URI producers and
: normalizers should use uppercase hexadecimal digits for all percent-
: encodings.
Thanks,
--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From 44475f709762ba1a2a881d20345cc6a4cb086f01 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <[email protected]>
Date: Thu, 20 Feb 2020 18:46:15 -0300
Subject: [PATCH v2] URI encode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Author: Anders Åstrand
Discussion: https://postgr.es/m/APwPebtwJnjjt=euusml1zz6w3jvna1cvjezhbouccytjc9...@mail.gmail.com
---
doc/src/sgml/func.sgml | 16 +++-
src/backend/utils/adt/encode.c | 129 ++++++++++++++++++++++++++
src/test/regress/expected/strings.out | 21 +++++
src/test/regress/sql/strings.sql | 7 ++
4 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ceda48e0fc..c60ad4f4e2 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3180,7 +3180,8 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<parameter>format</parameter> values are:
<link linkend="encode-format-base64"><literal>base64</literal></link>,
<link linkend="encode-format-escape"><literal>escape</literal></link>,
- <link linkend="encode-format-hex"><literal>hex</literal></link>
+ <link linkend="encode-format-hex"><literal>hex</literal></link>,
+ <link linkend="encode-format-uri"><literal>uri</literal></link>
</entry>
<entry><literal>encode('123\000\001', 'base64')</literal></entry>
<entry><literal>MTIzAAE=</literal></entry>
@@ -3274,6 +3275,19 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para>
</listitem>
</varlistentry>
+
+ <varlistentry id="encode-format-uri">
+ <term>uri
+ <indexterm>
+ <primary>uri format</primary>
+ </indexterm></term>
+ <listitem>
+ <para>
+ The <literal>uri</literal> format represents ...
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</para>
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index b8d9ec7e00..81d4ea8400 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -110,6 +110,7 @@ binary_decode(PG_FUNCTION_ARGS)
*/
static const char hextbl[] = "0123456789abcdef";
+static const char hextbl_upper[] = "0123456789ABCDEF";
static const int8 hexlookup[128] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -512,6 +513,162 @@ esc_dec_len(const char *src, unsigned srclen)
return len;
}
+/*
+ * URI percent encoding
+ *
+ * Percent-encodes all byte values except the unreserved ASCII characters as
+ * per RFC 3986.  Encoding emits uppercase hexadecimal digits, as the RFC
+ * recommends.
+ */
+
+/*
+ * Is c one of the "unreserved" characters of RFC 3986?
+ *
+ *		unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ *
+ * uri_encode and uri_enc_len both rely on this test, so the length computed
+ * by one always matches the number of bytes written by the other.
+ */
+static inline bool
+uri_is_unreserved(char c)
+{
+	return (c >= 'A' && c <= 'Z') ||
+		(c >= 'a' && c <= 'z') ||
+		(c >= '0' && c <= '9') ||
+		c == '-' ||
+		c == '.' ||
+		c == '_' ||
+		c == '~';
+}
+
+/*
+ * Percent-encode srclen bytes at src into dst.
+ *
+ * Each unreserved character is copied as is; every other byte becomes "%XX".
+ * That is exactly uri_enc_len(src, srclen) bytes, so dst must be at least
+ * that large.  Returns the number of bytes written; no terminating NUL is
+ * added.
+ */
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	const char *srcend = src + srclen;
+	char	   *d = dst;
+
+	for (const char *s = src; s < srcend; s++)
+	{
+		if (uri_is_unreserved(*s))
+			*d++ = *s;
+		else
+		{
+			/* Work on an unsigned copy so high-bit bytes can't go negative */
+			unsigned char c = (unsigned char) *s;
+
+			*d++ = '%';
+			*d++ = hextbl_upper[c >> 4];
+			*d++ = hextbl_upper[c & 0xF];
+		}
+	}
+	return d - dst;
+}
+
+/*
+ * Decode srclen bytes of percent-encoded data at src into dst.
+ *
+ * Each "%XX" sequence produces the single byte with hexadecimal value XX;
+ * all other bytes are copied unchanged.  Returns the number of bytes
+ * written, which is the same count uri_dec_len computes for this input.
+ */
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char	   *d = dst;
+	char		val;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/*
+			 * Verify we have the needed bytes.  This shouldn't happen, since
+			 * uri_dec_len already takes care of validation.  The remaining
+			 * length is compared directly because "srcend - 3" could point
+			 * before the start of an input shorter than three bytes.
+			 */
+			if (srcend - s < 3)
+				elog(ERROR, "invalid uri percent encoding");
+
+			/* Skip '%' */
+			s++;
+
+			val = get_hex(*s++) << 4;
+			val += get_hex(*s++);
+			*d++ = val;
+		}
+		else
+			*d++ = *s++;
+	}
+	return d - dst;
+}
+
+/*
+ * Compute the number of bytes uri_encode produces for this input.
+ *
+ * The count is kept in an unsigned variable to match the return type: the
+ * result can be as large as three times srclen, which would overflow a
+ * signed int for inputs larger than INT_MAX / 3 bytes.
+ */
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	const char *srcend = src + srclen;
+	unsigned	len = 0;
+
+	for (const char *s = src; s < srcend; s++)
+		len += uri_is_unreserved(*s) ? 1 : 3;
+
+	return len;
+}
+
+/*
+ * Compute the number of bytes uri_decode produces for this input, checking
+ * the input along the way.
+ *
+ * Reports an error if a '%' is not followed by at least two more bytes.
+ * Those two bytes are also passed to get_hex, whose results are discarded;
+ * the calls are made only so that invalid hexadecimal digits get rejected.
+ */
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	unsigned	len = 0;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/* Compare lengths; "srcend - 3" may not be a valid pointer */
+			if (srcend - s < 3)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			s++;
+			get_hex(*s++);
+			get_hex(*s++);
+		}
+		else
+			s++;
+		len++;
+	}
+
+	return len;
+}
+
/*
* Common
*/
@@ -541,6 +698,12 @@ static const struct
esc_enc_len, esc_dec_len, esc_encode, esc_decode
}
},
+ {
+ "uri",
+ {
+ uri_enc_len, uri_dec_len, uri_encode, uri_decode
+ }
+ },
{
NULL,
{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 60cb86193c..a79ef6ac10 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1892,3 +1892,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+ encode
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+ decode
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR: invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR: invalid uri percent encoding
+HINT: Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR: invalid uri percent encoding
+HINT: Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index c5cd15142a..8a7b103681 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -648,3 +648,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');
--
2.20.1