Hi,
In order to compare pairs of XML documents for equivalence, it is
necessary to first convert them to their canonical form, as described in
W3C Canonical XML 1.1.[1] This spec defines a standard physical
representation for XML documents that have more than one possible
representation, so that they can be compared, e.g. by forcing UTF-8
encoding, replacing entity references, normalizing attributes, etc.
Although it is not part of the SQL/XML standard, it would be nice to
have a CANONICAL option in XMLSERIALIZE. Additionally, we could also
add a WITH [NO] COMMENTS clause to keep or remove XML comments from
the documents.
Something like this:
WITH t(col) AS (
VALUES
('<?xml version="1.0" encoding="ISO-8859-1"?>
<!DOCTYPE doc SYSTEM "doc.dtd" [
<!ENTITY val "42">
<!ATTLIST xyz attr CDATA "default">
]>
<!-- ordering of attributes -->
<foo ns:c = "3" ns:b = "2" ns:a = "1"
xmlns:ns="http://postgresql.org">
<!-- Normalization of whitespace in start and end tags -->
<!-- Elimination of superfluous namespace declarations,
as already declared in <foo> -->
<bar xmlns:ns="http://postgresql.org" >&val;</bar >
<!-- Empty element conversion to start-end tag pair -->
<empty/>
<!-- Effect of transcoding from a sample encoding to UTF-8 -->
<iso8859>©</iso8859>
<!-- Addition of default attribute -->
<!-- Whitespace inside tag preserved -->
<xyz> 321 </xyz>
</foo>
<!-- comment outside doc -->'::xml)
)
SELECT xmlserialize(DOCUMENT col AS text CANONICAL) FROM t;
xmlserialize
--------------------------------------------------------------------------------------------------------------------------------------------------------
<foo xmlns:ns="http://postgresql.org" ns:a="1" ns:b="2"
ns:c="3"><bar>42</bar><empty></empty><iso8859>©</iso8859><xyz
attr="default"> 321 </xyz></foo>
(1 row)
-- using WITH COMMENTS
WITH t(col) AS (
VALUES
(' <foo ns:c = "3" ns:b = "2" ns:a = "1"
xmlns:ns="http://postgresql.org">
<!-- very important comment -->
<xyz> 321 </xyz>
</foo>'::xml)
)
SELECT xmlserialize(DOCUMENT col AS text CANONICAL WITH COMMENTS) FROM t;
xmlserialize
------------------------------------------------------------------------------------------------------------------------
<foo xmlns:ns="http://postgresql.org" ns:a="1" ns:b="2" ns:c="3"><!--
very important comment --><xyz> 321 </xyz></foo>
(1 row)
Another option would be to simply create a new function, e.g.
xmlcanonical(doc xml, keep_comments boolean), but I'm not sure if this
would be the right approach.
Attached is a very short draft. What do you think?
Best, Jim
1- https://www.w3.org/TR/xml-c14n11/
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index 19351fe34b..f8f10f0ed9 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -3829,6 +3829,8 @@ ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
{
Datum *argvalue = op->d.xmlexpr.argvalue;
bool *argnull = op->d.xmlexpr.argnull;
+ XmlSerializeFormat format = op->d.xmlexpr.xexpr->format;
+ text *data;
/* argument type is known to be xml */
Assert(list_length(xexpr->args) == 1);
@@ -3837,9 +3839,15 @@ ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
return;
value = argvalue[0];
- *op->resvalue = PointerGetDatum(xmltotext_with_xmloption(DatumGetXmlP(value),
- xexpr->xmloption));
*op->resnull = false;
+
+ data = xmltotext_with_xmloption(DatumGetXmlP(value),
+ xexpr->xmloption);
+
+ if (format == XMLDEFAULT_FORMAT)
+ *op->resvalue = PointerGetDatum(data);
+ else if (format == XMLCANONICAL || format == XMLCANONICAL_WITH_COMMENTS)
+ *op->resvalue = PointerGetDatum(xmlserialize_canonical(data,format));
}
break;
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index a0138382a1..af5f3dfdfd 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -619,6 +619,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <defelt> xmltable_column_option_el
%type <list> xml_namespace_list
%type <target> xml_namespace_el
+%type <ival> opt_xml_serialize_format
%type <node> func_application func_expr_common_subexpr
%type <node> func_expr func_expr_windowless
@@ -676,7 +677,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
BOOLEAN_P BOTH BREADTH BY
- CACHE CALL CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
+ CACHE CALL CALLED CANONICAL CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
CLUSTER COALESCE COLLATE COLLATION COLUMN COLUMNS COMMENT COMMENTS COMMIT
COMMITTED COMPRESSION CONCURRENTLY CONFIGURATION CONFLICT
@@ -15532,13 +15533,14 @@ func_expr_common_subexpr:
$$ = makeXmlExpr(IS_XMLROOT, NULL, NIL,
list_make3($3, $5, $6), @1);
}
- | XMLSERIALIZE '(' document_or_content a_expr AS SimpleTypename ')'
+ | XMLSERIALIZE '(' document_or_content a_expr AS SimpleTypename opt_xml_serialize_format ')'
{
XmlSerialize *n = makeNode(XmlSerialize);
n->xmloption = $3;
n->expr = $4;
n->typeName = $6;
+ n->format = $7;
n->location = @1;
$$ = (Node *) n;
}
@@ -15622,6 +15624,12 @@ xml_passing_mech:
| BY VALUE_P
;
+opt_xml_serialize_format: /* optional trailing format clause of XMLSERIALIZE */
+ CANONICAL { $$ = XMLCANONICAL; } /* bare CANONICAL: same value as WITH NO COMMENTS */
+ | CANONICAL WITH NO COMMENTS { $$ = XMLCANONICAL; }
+ | CANONICAL WITH COMMENTS { $$ = XMLCANONICAL_WITH_COMMENTS; }
+ | /*EMPTY*/ { $$ = XMLDEFAULT_FORMAT; }
+ ;
/*
* Aggregate decoration clauses
@@ -16737,6 +16745,7 @@ unreserved_keyword:
| CACHE
| CALL
| CALLED
+ | CANONICAL
| CASCADE
| CASCADED
| CATALOG_P
@@ -17259,6 +17268,7 @@ bare_label_keyword:
| CACHE
| CALL
| CALLED
+ | CANONICAL
| CASCADE
| CASCADED
| CASE
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c
index 7ff41acb84..ddfbfe259d 100644
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -2332,6 +2332,7 @@ transformXmlSerialize(ParseState *pstate, XmlSerialize *xs)
xexpr->xmloption = xs->xmloption;
xexpr->location = xs->location;
+ xexpr->format = xs->format;
/* We actually only need these to be able to parse back the expression. */
xexpr->type = targetType;
xexpr->typmod = targetTypmod;
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 079bcb1208..f5c4ee520c 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -56,6 +56,7 @@
#include <libxml/xmlwriter.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
+#include <libxml/c14n.h>
/*
* We used to check for xmlStructuredErrorContext via a configure test; but
@@ -4818,3 +4819,59 @@ XmlTableDestroyOpaque(TableFuncScanState *state)
NO_XML_SUPPORT();
#endif /* not USE_LIBXML */
}
+
+/*
+ * xmlserialize_canonical
+ *
+ * Parse the serialized XML document "data" and return its canonical form
+ * (C14N 1.1) as an xmltype.  "format" must be XMLCANONICAL (XML comments
+ * are dropped) or XMLCANONICAL_WITH_COMMENTS (XML comments are kept); any
+ * other value raises an error, as does a parse or canonicalization failure.
+ */
+xmltype *
+xmlserialize_canonical(text *data, XmlSerializeFormat format)
+{
+#ifdef USE_LIBXML
+	xmlDocPtr	doc;
+	xmlChar    *xmlbuf = NULL;
+	int			nbytes;
+	int			with_comments;
+	StringInfoData buf;
+
+	if (format != XMLCANONICAL && format != XMLCANONICAL_WITH_COMMENTS)
+		elog(ERROR,"invalid canonical xml option");
+
+	/* Don't rely on the enum's numeric values to form the libxml2 flag. */
+	with_comments = (format == XMLCANONICAL_WITH_COMMENTS) ? 1 : 0;
+
+	doc = xml_parse(data, XMLOPTION_DOCUMENT, false, GetDatabaseEncoding(), NULL);
+
+	if (!doc)
+		elog(ERROR, "could not parse the given XML document");
+
+	/*
+	 * Canonicalize the whole document (nodes == NULL) using C14N 1.1.  The
+	 * inclusive namespace prefix list only matters for exclusive
+	 * canonicalization, so NULL is passed.  On success the number of bytes
+	 * written is returned and xmlbuf points to the canonical text allocated
+	 * by libxml2; a negative result means failure.
+	 */
+	nbytes = xmlC14NDocDumpMemory(doc, NULL, XML_C14N_1_1, NULL,
+								  with_comments, &xmlbuf);
+
+	xmlFreeDoc(doc);
+
+	if (nbytes < 0)
+		elog(ERROR,"could not canonicalize the given XML document");
+
+	/* Copy exactly the bytes libxml2 reported, then release its buffer. */
+	initStringInfo(&buf);
+	appendBinaryStringInfo(&buf, (const char *) xmlbuf, nbytes);
+	xmlFree(xmlbuf);
+
+	return stringinfo_to_xmltype(&buf);
+#else
+	NO_XML_SUPPORT();
+	return NULL;
+#endif
+}
\ No newline at end of file
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index f7d7f10f7d..c75fec17ba 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -842,6 +842,7 @@ typedef struct XmlSerialize
Node *expr;
TypeName *typeName;
int location; /* token location, or -1 if unknown */
+ XmlSerializeFormat format;
} XmlSerialize;
/* Partitioning related definitions */
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index b4292253cc..f36d79fb14 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1471,6 +1471,13 @@ typedef enum XmlOptionType
XMLOPTION_CONTENT
} XmlOptionType;
+typedef enum XmlSerializeFormat /* output format requested in XMLSERIALIZE */
+{
+ XMLCANONICAL, /* CANONICAL [WITH NO COMMENTS] */
+ XMLCANONICAL_WITH_COMMENTS, /* CANONICAL WITH COMMENTS */
+ XMLDEFAULT_FORMAT /* no CANONICAL clause given */
+} XmlSerializeFormat;
+
typedef struct XmlExpr
{
Expr xpr;
@@ -1491,6 +1498,8 @@ typedef struct XmlExpr
int32 typmod pg_node_attr(query_jumble_ignore);
/* token location, or -1 if unknown */
int location;
+ /* serialization format: XMLCANONICAL, XMLCANONICAL_WITH_COMMENTS, XMLDEFAULT_FORMAT */
+ XmlSerializeFormat format pg_node_attr(query_jumble_ignore);
} XmlExpr;
/* ----------------
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index bb36213e6f..c1b1a720fe 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -67,6 +67,7 @@ PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("called", CALLED, UNRESERVED_KEYWORD, BARE_LABEL)
+PG_KEYWORD("canonical", CANONICAL, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("cascade", CASCADE, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("cascaded", CASCADED, UNRESERVED_KEYWORD, BARE_LABEL)
PG_KEYWORD("case", CASE, RESERVED_KEYWORD, BARE_LABEL)
diff --git a/src/include/utils/xml.h b/src/include/utils/xml.h
index 311da06cd6..745ebefe24 100644
--- a/src/include/utils/xml.h
+++ b/src/include/utils/xml.h
@@ -90,4 +90,5 @@ extern PGDLLIMPORT int xmloption; /* XmlOptionType, but int for guc enum */
extern PGDLLIMPORT const TableFuncRoutine XmlTableRoutine;
+extern xmltype *xmlserialize_canonical(text *data, XmlSerializeFormat format);
#endif /* XML_H */