On 12.09.24 12:56, Jim Jones wrote: > v14 attached adds the function xmlcanonicalize, as suggested.
rebase. Best regards, Jim
From 2121e54145eae40c8a6c172038ca3a4d07f9b78a Mon Sep 17 00:00:00 2001 From: Jim Jones <jim.jo...@uni-muenster.de> Date: Mon, 17 Feb 2025 12:05:34 +0100 Subject: [PATCH v15] Add xmlcanonicalize function This patch adds the xmlcanonicalize function, which transforms an XML document into its canonical form according to the W3C Canonical XML Version 1.1 specification. xmlcanonicalize(doc xml, keep_comments boolean) -> xml * doc: The XML document to be canonicalized. * keep_comments: A flag indicating whether to preserve or discard XML comments from the input document. This implementation is based on the xmlC14NDocDumpMemory function from the C14N module of libxml2. --- doc/src/sgml/func.sgml | 48 ++++++++++++++++++++ src/backend/utils/adt/xml.c | 43 ++++++++++++++++++ src/include/catalog/pg_proc.dat | 3 ++ src/test/regress/expected/xml.out | 70 +++++++++++++++++++++++++++++ src/test/regress/expected/xml_1.out | 69 ++++++++++++++++++++++++++++ src/test/regress/expected/xml_2.out | 70 +++++++++++++++++++++++++++++ src/test/regress/sql/xml.sql | 49 ++++++++++++++++++++ 7 files changed, 352 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 7efc81936a..bba9b7591d 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -14436,6 +14436,54 @@ SELECT xmltext('< foo & bar >'); </para> </sect3> +<sect3 id="functions-producing-xml-xmlcanonicalize"> + <title><literal>xmlcanonicalize</literal></title> + + <indexterm> + <primary>xmlcanonicalize</primary> + </indexterm> + +<synopsis> +<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type>, <parameter>keep_comments</parameter> <type>boolean</type> ) <returnvalue>xml</returnvalue> +</synopsis> + + <para> + This function transforms a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink>, + as defined by the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>, which standardizes the document's + structure and syntax to facilitate comparison and validation. + The <parameter>keep_comments</parameter> parameter controls whether XML comments from the input document are preserved or discarded. + </para> + + <para> + Example: +<screen><![CDATA[ +SELECT + xmlcanonicalize( + '<foo> + <!-- a comment --> + <bar c="3" b="2" a="1">42</bar> + <empty/> + </foo>'::xml, true); + xmlcanonicalize +----------------------------------------------------------------------------- + <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo> +(1 row) + +SELECT + xmlcanonicalize( + '<foo> + <!-- a comment --> + <bar c="3" b="2" a="1">42</bar> + <empty/> + </foo>'::xml, false); + xmlcanonicalize +----------------------------------------------------------- + <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo> +(1 row) +]]></screen> + </para> + </sect3> + <sect3 id="functions-producing-xml-xmlcomment"> <title><literal>xmlcomment</literal></title> diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index db8d0d6a7e..fb956710b8 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -58,6 +58,7 @@ #include <libxml/xmlwriter.h> #include <libxml/xpath.h> #include <libxml/xpathInternals.h> +#include <libxml/c14n.h> /* * We used to check for xmlStructuredErrorContext via a configure test; but @@ -544,6 +545,48 @@ xmltext(PG_FUNCTION_ARGS) #endif /* not USE_LIBXML */ } +/** + * Converts an XML document to its canonical form according to the + * W3C Canonical XML 1.1 specification implemented on xmlC14NDocDumpMemory. + */ +Datum +xmlcanonicalize(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + xmltype *arg = PG_GETARG_XML_P(0); + bool keep_comments = PG_GETARG_BOOL(1); + text *result; + int nbytes; + xmlDocPtr doc; + xmlChar *xmlbuf = NULL; + + doc = xml_parse(arg, XMLOPTION_DOCUMENT, false, + GetDatabaseEncoding(), NULL, NULL, NULL); + + /* + * This dumps the canonicalized XML doc into the xmlChar* buffer. + * mode = 2 means the doc will be canonicalized using the C14N 1.1 standard. + */ + nbytes = xmlC14NDocDumpMemory(doc, NULL, 2, NULL, keep_comments, &xmlbuf); + + if(doc) + xmlFreeDoc(doc); + + if(nbytes < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not canonicalize the given XML document"))); + + result = cstring_to_text_with_len((const char *) xmlbuf, nbytes); + + xmlFree(xmlbuf); + + PG_RETURN_XML_P(result); +#else + NO_XML_SUPPORT(); + return 0; +#endif /* not USE_LIBXML */ +} /* * TODO: xmlconcat needs to merge the notations and unparsed entities diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 9e803d610d..5989d2936b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -9042,6 +9042,9 @@ { oid => '3813', descr => 'generate XML text node', proname => 'xmltext', prorettype => 'xml', proargtypes => 'text', prosrc => 'xmltext' }, +{ oid => '3814', descr => 'generate the canonical form of an XML document', + proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool', + prosrc => 'xmlcanonicalize' }, { oid => '2923', descr => 'map table contents to XML', proname => 'table_to_xml', procost => '100', provolatile => 's', diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index 2e9616acda..af17094cca 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -1873,3 +1873,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); x<P>73</P>0.42truej (1 row) +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <!-- attributes and namespces will be sorted --> + + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+ + <!-- comment outside root element --> +(1 row) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo> +(1 row) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- + +(1 row) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: invalid XML document +SELECT xmlcanonicalize(' ', true); +ERROR: invalid XML document +SELECT xmlcanonicalize('foo', true); +ERROR: invalid XML document +\set VERBOSITY default diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out index 7505a14077..a7f10a5036 100644 --- a/src/test/regress/expected/xml_1.out +++ b/src/test/regress/expected/xml_1.out @@ -1482,3 +1482,72 @@ ERROR: unsupported XML feature LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':... ^ DETAIL: This functionality requires the server to be built with libxml support. +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +ERROR: unsupported XML feature +LINE 2: ('<?xml version="1.0" encoding="ISO-8859-1"?> + ^ +DETAIL: This functionality requires the server to be built with libxml support. +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: unsupported XML feature at character 24 +SELECT xmlcanonicalize(' ', true); +ERROR: unsupported XML feature at character 24 +SELECT xmlcanonicalize('foo', true); +ERROR: unsupported XML feature at character 24 +\set VERBOSITY default diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out index c07ed2b269..425e28c528 100644 --- a/src/test/regress/expected/xml_2.out +++ b/src/test/regress/expected/xml_2.out @@ -1859,3 +1859,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); x<P>73</P>0.42truej (1 row) +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <!-- attributes and namespces will be sorted --> + + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+ + <!-- comment outside root element --> +(1 row) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo> +(1 row) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- + +(1 row) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: invalid XML document +SELECT xmlcanonicalize(' ', true); +ERROR: invalid XML document +SELECT xmlcanonicalize('foo', true); +ERROR: invalid XML document +\set VERBOSITY default diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index bac0388ac1..ec04d1d56d 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -675,3 +675,52 @@ SELECT xmltext(' '); SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}'); SELECT xmltext('foo & <"bar">'); SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); + +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); + +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; +SELECT xmlcanonicalize(NULL, true); + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +SELECT xmlcanonicalize(' ', true); +SELECT xmlcanonicalize('foo', true); +\set VERBOSITY default \ No newline at end of file -- 2.34.1