Hi,

In order to compare pairs of XML documents for equivalence it is necessary to convert them first to their canonical form, as described in W3C Canonical XML 1.1.[1] This spec basically defines a standard physical representation for XML documents that have more than one possible representation, so that it is possible to compare them, e.g. by forcing UTF-8 encoding, entity reference replacement, attribute normalization, etc.

Although it is not part of the SQL/XML standard, it would be nice to have the option CANONICAL in xmlserialize. Additionally, we could also add the clause WITH [NO] COMMENTS to keep or remove XML comments from the documents.

Something like this:

WITH t(col) AS (
 VALUES
  ('<?xml version="1.0" encoding="ISO-8859-1"?>
  <!DOCTYPE doc SYSTEM "doc.dtd" [
  <!ENTITY val "42">
  <!ATTLIST xyz attr CDATA "default">
  ]>

  <!-- ordering of attributes -->
  <foo ns:c = "3" ns:b = "2" ns:a = "1"
    xmlns:ns="http://postgresql.org">

    <!-- Normalization of whitespace in start and end tags -->
    <!-- Elimination of superfluous namespace declarations,
         as already declared in <foo> -->
 <bar     xmlns:ns="http://postgresql.org" >&val;</bar     >

    <!-- Empty element conversion to start-end tag pair -->
    <empty/>

    <!-- Effect of transcoding from a sample encoding to UTF-8 -->
    <iso8859>&#169;</iso8859>

    <!-- Addition of default attribute -->
    <!-- Whitespace inside tag preserved -->
    <xyz> 321 </xyz>
  </foo>
  <!-- comment outside doc -->'::xml)
)
SELECT xmlserialize(DOCUMENT col AS text CANONICAL) FROM t;
xmlserialize
--------------------------------------------------------------------------------------------------------------------------------------------------------
 <foo xmlns:ns="http://postgresql.org" ns:a="1" ns:b="2" ns:c="3"><bar>42</bar><empty></empty><iso8859>©</iso8859><xyz attr="default"> 321 </xyz></foo>
(1 row)

-- using WITH COMMENTS

WITH t(col) AS (
 VALUES
  (' <foo ns:c = "3" ns:b = "2" ns:a = "1"
    xmlns:ns="http://postgresql.org">
    <!-- very important comment -->
    <xyz> 321 </xyz>
  </foo>'::xml)
)
SELECT xmlserialize(DOCUMENT col AS text CANONICAL WITH COMMENTS) FROM t;
xmlserialize
------------------------------------------------------------------------------------------------------------------------
 <foo xmlns:ns="http://postgresql.org" ns:a="1" ns:b="2" ns:c="3"><!-- very important comment --><xyz> 321 </xyz></foo>
(1 row)


Another option would be to simply create a new function, e.g. xmlcanonical(doc xml, keep_comments boolean), but I'm not sure if this would be the right approach.

Attached a very short draft. What do you think?

Best, Jim

1- https://www.w3.org/TR/xml-c14n11/
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index 19351fe34b..f8f10f0ed9 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -3829,6 +3829,8 @@ ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
 			{
 				Datum	   *argvalue = op->d.xmlexpr.argvalue;
 				bool	   *argnull = op->d.xmlexpr.argnull;
+				XmlSerializeFormat	format = op->d.xmlexpr.xexpr->format;
+				text	   *data;
 
 				/* argument type is known to be xml */
 				Assert(list_length(xexpr->args) == 1);
@@ -3837,9 +3839,15 @@ ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
 					return;
 				value = argvalue[0];
 
-				*op->resvalue = PointerGetDatum(xmltotext_with_xmloption(DatumGetXmlP(value),
-																		 xexpr->xmloption));
 				*op->resnull = false;
+
+				data = xmltotext_with_xmloption(DatumGetXmlP(value),
+												xexpr->xmloption);
+
+				if (format == XMLDEFAULT_FORMAT)
+					*op->resvalue = PointerGetDatum(data);
+				else if (format == XMLCANONICAL || format == XMLCANONICAL_WITH_COMMENTS)
+					*op->resvalue = PointerGetDatum(xmlserialize_canonical(data,format));
 			}
 			break;
 
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index a0138382a1..af5f3dfdfd 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -619,6 +619,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <defelt>	xmltable_column_option_el
 %type <list>	xml_namespace_list
 %type <target>	xml_namespace_el
+%type <ival> 	opt_xml_serialize_format
 
 %type <node>	func_application func_expr_common_subexpr
 %type <node>	func_expr func_expr_windowless
@@ -676,7 +677,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
 	BOOLEAN_P BOTH BREADTH BY
 
-	CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
+	CACHE CALL CALLED CANONICAL CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 	CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
 	CLUSTER COALESCE COLLATE COLLATION COLUMN COLUMNS COMMENT COMMENTS COMMIT
 	COMMITTED COMPRESSION CONCURRENTLY CONFIGURATION CONFLICT
@@ -15532,13 +15533,14 @@ func_expr_common_subexpr:
 					$$ = makeXmlExpr(IS_XMLROOT, NULL, NIL,
 									 list_make3($3, $5, $6), @1);
 				}
-			| XMLSERIALIZE '(' document_or_content a_expr AS SimpleTypename ')'
+			| XMLSERIALIZE '(' document_or_content a_expr AS SimpleTypename opt_xml_serialize_format ')'
 				{
 					XmlSerialize *n = makeNode(XmlSerialize);
 
 					n->xmloption = $3;
 					n->expr = $4;
 					n->typeName = $6;
+					n->format = $7;
 					n->location = @1;
 					$$ = (Node *) n;
 				}
@@ -15622,6 +15624,12 @@ xml_passing_mech:
 			| BY VALUE_P
 		;
 
+opt_xml_serialize_format:
+			CANONICAL								{ $$ = XMLCANONICAL; }
+			| CANONICAL WITH NO COMMENTS			{ $$ = XMLCANONICAL; }
+			| CANONICAL WITH COMMENTS				{ $$ = XMLCANONICAL_WITH_COMMENTS; }
+			| /*EMPTY*/								{ $$ = XMLDEFAULT_FORMAT; }
+		;
 
 /*
  * Aggregate decoration clauses
@@ -16737,6 +16745,7 @@ unreserved_keyword:
 			| CACHE
 			| CALL
 			| CALLED
+			| CANONICAL
 			| CASCADE
 			| CASCADED
 			| CATALOG_P
@@ -17259,6 +17268,7 @@ bare_label_keyword:
 			| CACHE
 			| CALL
 			| CALLED
+			| CANONICAL
 			| CASCADE
 			| CASCADED
 			| CASE
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c
index 7ff41acb84..ddfbfe259d 100644
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -2332,6 +2332,7 @@ transformXmlSerialize(ParseState *pstate, XmlSerialize *xs)
 
 	xexpr->xmloption = xs->xmloption;
 	xexpr->location = xs->location;
+	xexpr->format = xs->format;
 	/* We actually only need these to be able to parse back the expression. */
 	xexpr->type = targetType;
 	xexpr->typmod = targetTypmod;
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 079bcb1208..f5c4ee520c 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -56,6 +56,7 @@
 #include <libxml/xmlwriter.h>
 #include <libxml/xpath.h>
 #include <libxml/xpathInternals.h>
+#include <libxml/c14n.h>
 
 /*
  * We used to check for xmlStructuredErrorContext via a configure test; but
@@ -4818,3 +4819,59 @@ XmlTableDestroyOpaque(TableFuncScanState *state)
 	NO_XML_SUPPORT();
 #endif							/* not USE_LIBXML */
 }
+
+/*
+ * xmlserialize_canonical
+ *
+ * Parse "data" as an XML DOCUMENT and return it serialized in canonical form
+ * (C14N 1.1, see https://www.w3.org/TR/xml-c14n11/).  "format" must be
+ * XMLCANONICAL (comments dropped) or XMLCANONICAL_WITH_COMMENTS (comments
+ * kept); anything else raises an error.  In builds without USE_LIBXML the
+ * function only calls NO_XML_SUPPORT().
+ */
+xmltype *
+xmlserialize_canonical(text *data, XmlSerializeFormat format)
+{
+#ifdef USE_LIBXML
+	xmlDocPtr	doc;
+	xmlChar    *xmlbuf = NULL;
+	int			nbytes;
+	StringInfoData buf;
+
+	if (format != XMLCANONICAL && format != XMLCANONICAL_WITH_COMMENTS)
+		elog(ERROR, "invalid canonical xml option");
+
+	doc = xml_parse(data, XMLOPTION_DOCUMENT, false, GetDatabaseEncoding(), NULL);
+
+	if (!doc)
+		elog(ERROR, "could not parse the given XML document");
+
+	/*
+	 * Dump every node of the document (nodes == NULL) using C14N 1.1.  The
+	 * inclusive-namespace-prefix list only matters for exclusive
+	 * canonicalization, so it is NULL.  with_comments is derived from an
+	 * explicit comparison rather than from the enum's numeric value, so the
+	 * call stays correct if XmlSerializeFormat is ever reordered.
+	 */
+	nbytes = xmlC14NDocDumpMemory(doc, NULL, XML_C14N_1_1, NULL,
+								  format == XMLCANONICAL_WITH_COMMENTS,
+								  &xmlbuf);
+
+	xmlFreeDoc(doc);
+
+	/* a negative count signals failure; also never touch a NULL buffer */
+	if (nbytes < 0 || xmlbuf == NULL)
+		elog(ERROR, "could not canonicalize the given XML document");
+
+	/* copy exactly the number of bytes libxml2 reported */
+	initStringInfo(&buf);
+	appendBinaryStringInfo(&buf, (const char *) xmlbuf, nbytes);
+
+	xmlFree(xmlbuf);
+
+	return stringinfo_to_xmltype(&buf);
+#else
+	NO_XML_SUPPORT();
+	return NULL;
+#endif
+}
\ No newline at end of file
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index f7d7f10f7d..c75fec17ba 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -842,6 +842,7 @@ typedef struct XmlSerialize
 	Node	   *expr;
 	TypeName   *typeName;
 	int			location;		/* token location, or -1 if unknown */
+	XmlSerializeFormat	format;
 } XmlSerialize;
 
 /* Partitioning related definitions */
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index b4292253cc..f36d79fb14 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1471,6 +1471,13 @@ typedef enum XmlOptionType
 	XMLOPTION_CONTENT
 } XmlOptionType;
 
+typedef enum XmlSerializeFormat
+{
+	XMLCANONICAL,				/* CANONICAL [WITH NO COMMENTS] */
+	XMLCANONICAL_WITH_COMMENTS, /* CANONICAL WITH COMMENTS */
+	XMLDEFAULT_FORMAT			/* no format clause given */
+} XmlSerializeFormat;
+
 typedef struct XmlExpr
 {
 	Expr		xpr;
@@ -1491,6 +1498,8 @@ typedef struct XmlExpr
 	int32		typmod pg_node_attr(query_jumble_ignore);
 	/* token location, or -1 if unknown */
 	int			location;
+	/* serialization format: XMLCANONICAL, XMLCANONICAL_WITH_COMMENTS, XMLDEFAULT_FORMAT */
+	XmlSerializeFormat format pg_node_attr(query_jumble_ignore);
 } XmlExpr;
 
 /* ----------------
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index bb36213e6f..c1b1a720fe 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -67,6 +67,7 @@ PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("called", CALLED, UNRESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("canonical", CANONICAL, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cascade", CASCADE, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("cascaded", CASCADED, UNRESERVED_KEYWORD, BARE_LABEL)
 PG_KEYWORD("case", CASE, RESERVED_KEYWORD, BARE_LABEL)
diff --git a/src/include/utils/xml.h b/src/include/utils/xml.h
index 311da06cd6..745ebefe24 100644
--- a/src/include/utils/xml.h
+++ b/src/include/utils/xml.h
@@ -90,4 +90,5 @@ extern PGDLLIMPORT int xmloption;	/* XmlOptionType, but int for guc enum */
 
 extern PGDLLIMPORT const TableFuncRoutine XmlTableRoutine;
 
+xmltype *xmlserialize_canonical(text *data, XmlSerializeFormat format);
 #endif							/* XML_H */

Reply via email to