Hi
2017-08-19 22:53 GMT+02:00 Pavel Stehule <[email protected]>:
> Hi
>
> I am sending a POC - it supports XPATH and XMLTABLE for non-UTF8
> server encodings.
>
> In this case, all strings should be converted to UTF8 before calling libXML2
> functions, and the results should be converted back from UTF8.
>
> I found some previous experiments:
> https://marc.info/?l=pgsql-bugs&m=123407176408688
>
> Note: I learned that the xmlNodeDump function we use is deprecated,
> so we should replace it too at some point.
>
> Regards
>
>
I forgot a debug elog in the previous patch
> Pavel
>
>
>
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index c47624eff6..a43cf13d16 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -147,6 +147,7 @@ static int xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
ArrayBuildState *astate,
PgXmlErrorContext *xmlerrcxt);
static xmlChar *pg_xmlCharStrndup(char *str, size_t len);
+static xmlChar *pg_xmlCharUtf8(char *str, size_t len);
#endif /* USE_LIBXML */
static void xmldata_root_element_start(StringInfo result, const char *eltname,
@@ -459,8 +460,28 @@ cstring_to_xmltype(const char *string)
static xmltype *
xmlBuffer_to_xmltype(xmlBufferPtr buf)
{
- return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
+ if (GetDatabaseEncoding() != PG_UTF8)
+ {
+ char *utf8str = (char *) xmlBufferContent(buf);
+ char *str;
+ xmltype *result;
+
+ str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+ xmlBufferLength(buf),
+ PG_UTF8,
+ GetDatabaseEncoding());
+
+ Assert(str != utf8str);
+ result = (xmltype *) cstring_to_text(str);
+ pfree(str);
+
+ return result;
+ }
+ else
+ {
+ return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
xmlBufferLength(buf));
+ }
}
#endif
@@ -1176,6 +1197,28 @@ pg_xmlCharStrndup(char *str, size_t len)
}
/*
+ * LibXML2's internal encoding is UTF8. Sometimes LibXML2 converts
+ * its input to UTF8 by itself; sometimes it expects UTF8 strings.
+ * This function is used for converting from the database encoding
+ * to UTF8.
+ */
+static xmlChar *
+pg_xmlCharUtf8(char *str, size_t len)
+{
+ char *result;
+
+ result = (char *) pg_do_encoding_conversion((unsigned char *) str,
+ len,
+ GetDatabaseEncoding(),
+ PG_UTF8);
+
+ if (result != str)
+ return BAD_CAST result;
+
+ return pg_xmlCharStrndup(str, len);
+}
+
+/*
* str is the null-terminated input string. Remaining arguments are
* output arguments; each can be NULL if value is not wanted.
* version and encoding are returned as locally-palloc'd strings.
@@ -3714,9 +3757,16 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
}
else
{
- xmlChar *str;
+ xmlChar *utf8str;
+ char *str = NULL;
+
+ utf8str = xmlXPathCastNodeToString(cur);
+
+ str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+ strlen((char *) utf8str),
+ PG_UTF8,
+ GetDatabaseEncoding());
- str = xmlXPathCastNodeToString(cur);
PG_TRY();
{
/* Here we rely on XML having the same representation as TEXT */
@@ -3727,11 +3777,18 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
}
PG_CATCH();
{
- xmlFree(str);
+ if (str != (char *) utf8str)
+ pfree(str);
+
+ xmlFree(utf8str);
PG_RE_THROW();
}
PG_END_TRY();
- xmlFree(str);
+
+ if (str != (char *) utf8str)
+ pfree(str);
+
+ xmlFree(utf8str);
}
return result;
@@ -3758,6 +3815,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
Datum datum;
Oid datumtype;
char *result_str;
+ char *str = NULL;
switch (xpathobj->type)
{
@@ -3797,7 +3855,18 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
case XPATH_STRING:
if (astate == NULL)
return 1;
- datum = CStringGetDatum((char *) xpathobj->stringval);
+
+ /*
+ * returned string is in UTF8 encoding - should be encoded
+ * to database encoding first.
+ */
+ str = (char *) pg_do_encoding_conversion((unsigned char *) xpathobj->stringval,
+ strlen((char *) xpathobj->stringval),
+ PG_UTF8,
+ GetDatabaseEncoding());
+
+ datum = CStringGetDatum(str);
+
datumtype = CSTRINGOID;
break;
@@ -3812,6 +3881,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
datum = PointerGetDatum(cstring_to_xmltype(result_str));
(void) accumArrayResult(astate, datum, false,
XMLOID, CurrentMemoryContext);
+
return 1;
}
@@ -3895,7 +3965,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
errmsg("empty XPath expression")));
string = pg_xmlCharStrndup(datastr, len);
- xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len);
+ xpath_expr = pg_xmlCharUtf8(VARDATA_ANY(xpath_expr_text), xpath_len);
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
@@ -3911,7 +3981,9 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
if (ctxt == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context");
- doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
+ doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL,
+ pg_encoding_to_char(GetDatabaseEncoding()), 0);
+
if (doc == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document");
@@ -3929,22 +4001,25 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
{
for (i = 0; i < ns_count; i++)
{
- char *ns_name;
- char *ns_uri;
+ text *ns_name;
+ text *ns_uri;
if (ns_names_uris_nulls[i * 2] ||
ns_names_uris_nulls[i * 2 + 1])
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("neither namespace name nor URI may be null")));
- ns_name = TextDatumGetCString(ns_names_uris[i * 2]);
- ns_uri = TextDatumGetCString(ns_names_uris[i * 2 + 1]);
+ ns_name = DatumGetTextP(ns_names_uris[i * 2]);
+ ns_uri = DatumGetTextP(ns_names_uris[i * 2 + 1]);
if (xmlXPathRegisterNs(xpathctx,
- (xmlChar *) ns_name,
- (xmlChar *) ns_uri) != 0)
+ pg_xmlCharUtf8(VARDATA_ANY(ns_name),
+ VARSIZE(ns_name) - VARHDRSZ),
+ pg_xmlCharUtf8(VARDATA_ANY(ns_uri),
+ VARSIZE(ns_uri) - VARHDRSZ)) != 0)
ereport(ERROR, /* is this an internal error??? */
(errmsg("could not register XML namespace with name \"%s\" and URI \"%s\"",
- ns_name, ns_uri)));
+ TextDatumGetCString(ns_name),
+ TextDatumGetCString(ns_uri))));
}
}
@@ -4242,18 +4317,14 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value)
xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetDocument");
- /*
- * Use out function for casting to string (remove encoding property). See
- * comment in xml_out.
- */
- str = xml_out_internal(xmlval, 0);
-
- length = strlen(str);
+ str = VARDATA(xmlval);
+ length = VARSIZE(xmlval) - VARHDRSZ;
xstr = pg_xmlCharStrndup(str, length);
PG_TRY();
{
- doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0);
+ doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL,
+ pg_encoding_to_char(GetDatabaseEncoding()), 0);
if (doc == NULL || xtCxt->xmlerrcxt->err_occurred)
xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document");
@@ -4301,8 +4372,8 @@ XmlTableSetNamespace(TableFuncScanState *state, char *name, char *uri)
xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetNamespace");
if (xmlXPathRegisterNs(xtCxt->xpathcxt,
- pg_xmlCharStrndup(name, strlen(name)),
- pg_xmlCharStrndup(uri, strlen(uri))))
+ pg_xmlCharUtf8(name, strlen(name)),
+ pg_xmlCharUtf8(uri, strlen(uri))))
xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_DATA_EXCEPTION,
"could not set XML namespace");
#else
@@ -4328,7 +4399,7 @@ XmlTableSetRowFilter(TableFuncScanState *state, char *path)
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("row path filter must not be empty string")));
- xstr = pg_xmlCharStrndup(path, strlen(path));
+ xstr = pg_xmlCharUtf8(path, strlen(path));
xtCxt->xpathcomp = xmlXPathCompile(xstr);
if (xtCxt->xpathcomp == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4359,7 +4430,7 @@ XmlTableSetColumnFilter(TableFuncScanState *state, char *path, int colnum)
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("column path filter must not be empty string")));
- xstr = pg_xmlCharStrndup(path, strlen(path));
+ xstr = pg_xmlCharUtf8(path, strlen(path));
xtCxt->xpathscomp[colnum] = xmlXPathCompile(xstr);
if (xtCxt->xpathscomp[colnum] == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4502,7 +4573,15 @@ XmlTableGetValue(TableFuncScanState *state, int colnum,
{
PG_TRY();
{
- cstr = pstrdup((char *) str);
+ if (GetDatabaseEncoding() != PG_UTF8)
+ {
+ cstr = (char *) pg_do_encoding_conversion((unsigned char *) str,
+ strlen((char *) str),
+ PG_UTF8,
+ GetDatabaseEncoding());
+ }
+ else
+ cstr = pstrdup((char *) str);
}
PG_CATCH();
{
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index bcc585d427..6a43896d40 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1452,3 +1452,24 @@ SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c
14
(4 rows)
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+ -- leave early without error, when we are not sure about result of conversion
+ IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+ -- build valid UTF8 XML with broken encoding declaration
+ str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+ || convert_from('\xf2', 'windows-1250')
+ || '</remark></vino></enprimeur>';
+
+ -- should to work
+ RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+ RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
+NOTICE: {<id>909</id>}
+NOTICE: 909
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index eb4687fb09..97a3aa9de2 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -558,3 +558,23 @@ INSERT INTO xmltest2 VALUES('<d><r><dc>2</dc></r></d>', 'D');
SELECT xmltable.* FROM xmltest2, LATERAL xmltable('/d/r' PASSING x COLUMNS a int PATH '' || lower(_path) || 'c');
SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH '.');
SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH 'x' DEFAULT ascii(_path) - 54);
+
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+ -- leave early without error, when we are not sure about result of conversion
+ IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+ -- build valid UTF8 XML with broken encoding declaration
+ str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+ || convert_from('\xf2', 'windows-1250')
+ || '</remark></vino></enprimeur>';
+
+ -- should to work
+ RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+ RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers