On 12.09.24 12:56, Jim Jones wrote:
> v14 attached adds the function xmlcanonicalize, as suggested.

rebase.

Best regards, Jim
From 2121e54145eae40c8a6c172038ca3a4d07f9b78a Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Mon, 17 Feb 2025 12:05:34 +0100
Subject: [PATCH v15] Add xmlcanonicalize function

This patch adds the xmlcanonicalize function, which transforms an
XML document into its canonical form according to the W3C Canonical
XML Version 1.1 specification.

xmlcanonicalize(doc xml, keep_comments boolean) -> xml

* doc: The XML document to be canonicalized.
* keep_comments: A flag indicating whether to preserve or discard
  XML comments from the input document.

This implementation is based on the xmlC14NDocDumpMemory function
from the C14N module of libxml2.
---
 doc/src/sgml/func.sgml              | 48 ++++++++++++++++++++
 src/backend/utils/adt/xml.c         | 43 ++++++++++++++++++
 src/include/catalog/pg_proc.dat     |  3 ++
 src/test/regress/expected/xml.out   | 70 +++++++++++++++++++++++++++++
 src/test/regress/expected/xml_1.out | 69 ++++++++++++++++++++++++++++
 src/test/regress/expected/xml_2.out | 70 +++++++++++++++++++++++++++++
 src/test/regress/sql/xml.sql        | 49 ++++++++++++++++++++
 7 files changed, 352 insertions(+)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 7efc81936a..bba9b7591d 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -14436,6 +14436,54 @@ SELECT xmltext('< foo & bar >');
     </para>
    </sect3>
 
+<sect3 id="functions-producing-xml-xmlcanonicalize">
+    <title><literal>xmlcanonicalize</literal></title>
+
+    <indexterm>
+     <primary>xmlcanonicalize</primary>
+    </indexterm>
+
+<synopsis>
+<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type>, <parameter>keep_comments</parameter> <type>boolean</type> ) <returnvalue>xml</returnvalue>
+</synopsis>
+
+    <para>
+     This function transforms a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology";>canonical form</ulink>,
+     as defined by the <ulink url="https://www.w3.org/TR/xml-c14n11/";>W3C Canonical XML 1.1 Specification</ulink>, which standardizes the document's
+     structure and syntax to facilitate comparison and validation.
+     The <parameter>keep_comments</parameter> parameter controls whether XML comments from the input document are preserved or discarded.
+    </para>
+
+    <para>
+     Example:
+<screen><![CDATA[
+SELECT
+  xmlcanonicalize(
+    '<foo>
+       <!-- a comment -->
+       <bar c="3" b="2" a="1">42</bar>
+       <empty/>
+     </foo>'::xml, true);
+                               xmlcanonicalize
+-----------------------------------------------------------------------------
+ <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+
+SELECT
+  xmlcanonicalize(
+    '<foo>
+       <!-- a comment -->
+       <bar c="3" b="2" a="1">42</bar>
+       <empty/>
+     </foo>'::xml, false);
+                      xmlcanonicalize
+-----------------------------------------------------------
+ <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+]]></screen>
+    </para>
+   </sect3>
+
    <sect3 id="functions-producing-xml-xmlcomment">
     <title><literal>xmlcomment</literal></title>
 
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index db8d0d6a7e..fb956710b8 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -58,6 +58,7 @@
 #include <libxml/xmlwriter.h>
 #include <libxml/xpath.h>
 #include <libxml/xpathInternals.h>
+#include <libxml/c14n.h>
 
 /*
  * We used to check for xmlStructuredErrorContext via a configure test; but
@@ -544,6 +545,48 @@ xmltext(PG_FUNCTION_ARGS)
 #endif							/* not USE_LIBXML */
 }
 
+/**
+ * Converts an XML document to its canonical form according to the
+ * W3C Canonical XML 1.1 specification implemented on xmlC14NDocDumpMemory.
+ */
+Datum
+xmlcanonicalize(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+	xmltype    *arg = PG_GETARG_XML_P(0);
+	bool		keep_comments = PG_GETARG_BOOL(1);
+	text	   *result;
+	int			nbytes;
+	xmlDocPtr	doc;
+	xmlChar    *xmlbuf = NULL;
+
+	doc = xml_parse(arg, XMLOPTION_DOCUMENT, false,
+					GetDatabaseEncoding(), NULL, NULL, NULL);
+
+	/*
+	 * This dumps the canonicalized XML doc into the xmlChar* buffer.
+	 * mode = 2 means the doc will be canonicalized using the C14N 1.1 standard.
+	 */
+	nbytes = xmlC14NDocDumpMemory(doc, NULL, 2, NULL, keep_comments, &xmlbuf);
+
+	if(doc)
+		xmlFreeDoc(doc);
+
+	if(nbytes < 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			errmsg("could not canonicalize the given XML document")));
+
+	result = cstring_to_text_with_len((const char *) xmlbuf, nbytes);
+
+	xmlFree(xmlbuf);
+
+	PG_RETURN_XML_P(result);
+#else
+	NO_XML_SUPPORT();
+	return 0;
+#endif							/* not USE_LIBXML */
+}
 
 /*
  * TODO: xmlconcat needs to merge the notations and unparsed entities
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 9e803d610d..5989d2936b 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -9042,6 +9042,9 @@
 { oid => '3813', descr => 'generate XML text node',
   proname => 'xmltext', prorettype => 'xml', proargtypes => 'text',
   prosrc => 'xmltext' },
+{ oid => '3814', descr => 'generate the canonical form of an XML document',
+  proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool',
+  prosrc => 'xmlcanonicalize' },
 
 { oid => '2923', descr => 'map table contents to XML',
   proname => 'table_to_xml', procost => '100', provolatile => 's',
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 2e9616acda..af17094cca 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1873,3 +1873,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
  x&lt;P&gt;73&lt;/P&gt;0.42truej
 (1 row)
 
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+                                                                                                                                                                                                                                                                                                                                                                                            xmlcanonicalize                                                                                                                                                                                                                                                                                                                                                                                             
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted -->                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      +
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+                                                                                                                                                                                   xmlcanonicalize                                                                                                                                                                                    
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('  ', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR:  invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index 7505a14077..a7f10a5036 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -1482,3 +1482,72 @@ ERROR:  unsupported XML feature
 LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':...
                              ^
 DETAIL:  This functionality requires the server to be built with libxml support.
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+ERROR:  unsupported XML feature
+LINE 2:   ('<?xml version="1.0" encoding="ISO-8859-1"?>
+           ^
+DETAIL:  This functionality requires the server to be built with libxml support.
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  unsupported XML feature at character 24
+SELECT xmlcanonicalize('  ', true);
+ERROR:  unsupported XML feature at character 24
+SELECT xmlcanonicalize('foo', true);
+ERROR:  unsupported XML feature at character 24
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index c07ed2b269..425e28c528 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -1859,3 +1859,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
  x&lt;P&gt;73&lt;/P&gt;0.42truej
 (1 row)
 
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+                                                                                                                                                                                                                                                                                                                                                                                            xmlcanonicalize                                                                                                                                                                                                                                                                                                                                                                                             
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted -->                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      +
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+                                                                                                                                                                                   xmlcanonicalize                                                                                                                                                                                    
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('  ', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR:  invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index bac0388ac1..ec04d1d56d 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -675,3 +675,52 @@ SELECT xmltext('  ');
 SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}');
 SELECT xmltext('foo & <"bar">');
 SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
+
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(NULL, true);
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+SELECT xmlcanonicalize('  ', true);
+SELECT xmlcanonicalize('foo', true);
+\set VERBOSITY default
\ No newline at end of file
-- 
2.34.1

Reply via email to