rebase
Jim
From 2dd06194c24c89950e0e4e334469bc9a24543092 Mon Sep 17 00:00:00 2001
From: Jim Jones <[email protected]>
Date: Thu, 18 Jun 2026 08:08:48 +0200
Subject: [PATCH v26] Add xmlcanonicalize function
This adds xmlcanonicalize(doc xml, keep_comments boolean DEFAULT true),
which transforms a well-formed XML document into its canonical form
according to the W3C Canonical XML 1.1 specification. The canonical
form provides a standardized, byte-for-byte reproducible representation
useful for document comparison and digital signatures.
Author: Jim Jones <[email protected]>
Reviewed-by: Andrew Dunstan <[email protected]>
Reviewed-by: Tom Lane <[email protected]>
Reviewed-by: Pavel Stehule <[email protected]>
Reviewed-by: vignesh C <[email protected]>
Reviewed-by: Oliver Ford <[email protected]>
Reviewed-by: newtglobal postgresql_contributors <[email protected]>
Reviewed-by: Chapman Flack <[email protected]>
Discussion: https://www.postgresql.org/message-id/flat/67fa8560-8d61-5d06-8178-fc9c7684db90%40uni-muenster.de
---
doc/src/sgml/func/func-xml.sgml | 44 ++++++++++
src/backend/utils/adt/xml.c | 77 +++++++++++++++++
src/include/catalog/pg_proc.dat | 4 +
src/test/regress/expected/xml.out | 132 ++++++++++++++++++++++++++++++
src/test/regress/sql/xml.sql | 53 ++++++++++++
5 files changed, 310 insertions(+)
diff --git a/doc/src/sgml/func/func-xml.sgml b/doc/src/sgml/func/func-xml.sgml
index 511bc90852a..cdf053ae1ed 100644
--- a/doc/src/sgml/func/func-xml.sgml
+++ b/doc/src/sgml/func/func-xml.sgml
@@ -61,6 +61,50 @@ SELECT xmltext('< foo & bar >');
</para>
</sect3>
+ <sect3 id="functions-producing-xml-xmlcanonicalize">
+ <title><literal>xmlcanonicalize</literal></title>
+
+ <indexterm>
+ <primary>xmlcanonicalize</primary>
+ </indexterm>
+
+<synopsis>
+<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type> [, <parameter>keep_comments</parameter> <type>boolean</type> DEFAULT <literal>true</literal>] ) <returnvalue>text</returnvalue>
+</synopsis>
+
+ <para>
+ This function transforms a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink>,
+ as defined by the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>, which standardizes the document's
+ structure and syntax to facilitate comparison and digital signatures.
+ The <parameter>keep_comments</parameter> parameter controls whether XML comments from the input document are preserved or discarded.
+ If omitted, it defaults to <literal>true</literal>.
+ </para>
+
+ <para>
+ The W3C specification defines the canonical form as UTF-8 encoded, but
+ the result is returned as <type>text</type> in the database encoding.
+ In databases that do not use UTF-8, documents containing characters that
+ cannot be represented in the database encoding produce an encoding error.
+ </para>
+
+ <para>
+ Example:
+<screen><![CDATA[
+SELECT xmlcanonicalize('<foo><!-- a comment --><bar c="3" b="2" a="1">42</bar><empty/></foo>'::xml);
+ xmlcanonicalize
+-----------------------------------------------------------------------------
+ <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+
+SELECT xmlcanonicalize('<foo><!-- a comment --><bar c="3" b="2" a="1">42</bar><empty/></foo>'::xml, false);
+ xmlcanonicalize
+-----------------------------------------------------------
+ <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+]]></screen>
+ </para>
+ </sect3>
+
<sect3 id="functions-producing-xml-xmlcomment">
<title><literal>xmlcomment</literal></title>
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 0953ad2becb..33ae6c2837e 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -46,6 +46,7 @@
#include "postgres.h"
#ifdef USE_LIBXML
+#include <libxml/c14n.h>
#include <libxml/chvalid.h>
#include <libxml/entities.h>
#include <libxml/parser.h>
@@ -566,6 +567,82 @@ xmltext(PG_FUNCTION_ARGS)
#endif /* not USE_LIBXML */
}
+/*
+ * Return the W3C Canonical XML 1.1 form of an XML document, as produced by
+ * libxml2's xmlC14NDocDumpMemory().
+ *
+ * The first argument has to parse as a complete, well-formed document;
+ * fragments are rejected.  The second argument selects whether comments
+ * are carried into the canonical output (true) or dropped (false).
+ *
+ * Canonical output is deterministic, which makes it suitable for digital
+ * signatures and for comparing logically equivalent documents.
+ */
+Datum
+xmlcanonicalize(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+	xmltype    *input = PG_GETARG_XML_P(0);
+	bool		with_comments = PG_GETARG_BOOL(1);
+	PgXmlErrorContext *errcxt;
+	volatile xmlDocPtr parsed = NULL;
+	xmlChar    *volatile c14n_buf = NULL;
+	int			c14n_len = 0;
+	text	   *canonical;
+
+	/* Hook libxml2 error reporting into the backend before calling it */
+	errcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
+
+	PG_TRY();
+	{
+		char	   *server_str;
+
+		/* Only complete documents are accepted, never fragments */
+		parsed = xml_parse(input, XMLOPTION_DOCUMENT, true,
+						   GetDatabaseEncoding(), NULL, NULL, NULL);
+
+		/* Dump the whole document in C14N 1.1 form */
+		c14n_len = xmlC14NDocDumpMemory(parsed, NULL, XML_C14N_1_1,
+										NULL, with_comments,
+										(xmlChar **) &c14n_buf);
+
+		if (c14n_len < 0 || c14n_buf == NULL || errcxt->err_occurred)
+			xml_ereport(errcxt, ERROR, ERRCODE_INTERNAL_ERROR,
+						"could not canonicalize XML document");
+
+		/*
+		 * The canonical form comes back as UTF-8 no matter what the
+		 * database encoding is, so translate it into the server encoding
+		 * before building the text datum.
+		 */
+		server_str = pg_any_to_server((char *) c14n_buf, c14n_len, PG_UTF8);
+		canonical = cstring_to_text(server_str);
+		/* free the converted copy, if the conversion made one */
+		if (server_str != (char *) c14n_buf)
+			pfree(server_str);
+	}
+	PG_CATCH();
+	{
+		if (parsed != NULL)
+			xmlFreeDoc((xmlDocPtr) parsed);
+		if (c14n_buf != NULL)
+			xmlFree((xmlChar *) c14n_buf);
+
+		pg_xml_done(errcxt, true);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	xmlFreeDoc((xmlDocPtr) parsed);
+	xmlFree((xmlChar *) c14n_buf);
+	pg_xml_done(errcxt, false);
+
+	PG_RETURN_TEXT_P(canonical);
+#else
+	NO_XML_SUPPORT();
+	return 0;
+#endif							/* not USE_LIBXML */
+}
/*
* TODO: xmlconcat needs to merge the notations and unparsed entities
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index be157a5fbe9..7915d49f169 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -9283,6 +9283,10 @@
{ oid => '3813', descr => 'generate XML text node',
proname => 'xmltext', prorettype => 'xml', proargtypes => 'text',
prosrc => 'xmltext' },
+{ oid => '3814', descr => 'generate the canonical form of an XML document',
+ proname => 'xmlcanonicalize', prorettype => 'text', proargtypes => 'xml bool',
+ proargnames => '{doc,keep_comments}', proargdefaults => '{true}',
+ prosrc => 'xmlcanonicalize' },
{ oid => '2923', descr => 'map table contents to XML',
proname => 'table_to_xml', procost => '100', provolatile => 's',
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index fb3e0ec41b2..cd09ee407c7 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1891,3 +1891,135 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
x<P>73</P>0.42truej
(1 row)
+-- xmlcanonicalize
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespaces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespaces will be sorted --> +
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out">+
+ +
+ <!-- Normalization of whitespace in start and end tags --> +
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> +
+ <bar xmlns="">42</bar> +
+ +
+ <!-- empty element will be converted to start-end tag pair --> +
+ <empty></empty> +
+ +
+ <!-- text will be transcoded to UTF-8 --> +
+ <transcode>1</transcode> +
+ +
+ <!-- whitespace inside tag will be preserved --> +
+ <whitespace> 321 </whitespace> +
+ +
+ <!-- empty namespace will be removed of child tag --> +
+ <emptyns xmlns=""> +
+ <emptyns_child></emptyns_child> +
+ </emptyns> +
+ +
+ <!-- CDATA section will be replaced by its value --> +
+ <compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute> +
+ </foo> +
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out">+
+ +
+ +
+ +
+ <bar xmlns="">42</bar> +
+ +
+ +
+ <empty></empty> +
+ +
+ +
+ <transcode>1</transcode> +
+ +
+ +
+ <whitespace> 321 </whitespace> +
+ +
+ +
+ <emptyns xmlns=""> +
+ <emptyns_child></emptyns_child> +
+ </emptyns> +
+ +
+ +
+ <compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute> +
+ </foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, true) = xmlcanonicalize(doc) FROM xmlcanonicalize_test; -- keep_comments defaults to true
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT xmlcanonicalize(xmlcanonicalize(doc, true)::xml, true) = xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; -- canonicalization is idempotent
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; -- NULL keep_comments argument
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true); -- NULL document argument
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true); -- fail: empty string
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' ', true); -- fail: whitespace only
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo', true); -- fail: plain text, no root element
+ERROR: invalid XML document
+SELECT xmlcanonicalize(''); -- fail: empty string
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' '); -- fail: whitespace only
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo'); -- fail: plain text, no root element
+ERROR: invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index aafd39433a6..4d198d517e8 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -685,3 +685,56 @@ SELECT xmltext(' ');
SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}');
SELECT xmltext('foo & <"bar">');
SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
+
+-- xmlcanonicalize
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespaces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, true) = xmlcanonicalize(doc) FROM xmlcanonicalize_test; -- keep_comments defaults to true
+SELECT xmlcanonicalize(xmlcanonicalize(doc, true)::xml, true) = xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; -- canonicalization is idempotent
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; -- NULL keep_comments argument
+SELECT xmlcanonicalize(NULL, true); -- NULL document argument
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true); -- fail: empty string
+SELECT xmlcanonicalize(' ', true); -- fail: whitespace only
+SELECT xmlcanonicalize('foo', true); -- fail: plain text, no root element
+SELECT xmlcanonicalize(''); -- fail: empty string
+SELECT xmlcanonicalize(' '); -- fail: whitespace only
+SELECT xmlcanonicalize('foo'); -- fail: plain text, no root element
+\set VERBOSITY default
--
2.54.0