Hi,

The chromium team have recently detected a fuzz-testing bug in libxml / ICU
where UTF8 chars can be decoded incorrectly.  See http://crbug.com/722420.

The root cause of this problem is that libxml is calling ICU ucnv_convertEx
with incorrect params.  It is always setting flush to TRUE.  This param
should only be set to true for the last call when reading an input.

Also, when calling ucnv_convertEx multiple times (with flush=FALSE), the
caller must provide a pivot buffer which is maintained between calls.

This patch fixes those issues.  The patch includes test/icu_parse_test.xml
to reproduce the error.

./configure --with-icu --with-iconv=no
make runtest
./runtest

The test sets encoding to "UTF8-".  This encoding is chosen since it is not
recognized by libxml and forces the decoding to be done by ICU which
recognizes this encoding as UTF8.  Unfortunately, the test always fails
when using iconv since iconv does not recognize this encoding.  Since iconv
is the default in libxml, I understand that it may not make sense to always
run this testcase, or to include it at all.

This patch has been reviewed by Markus Scherer (maintainer of ICU), and
Jungshik in the chromium team who worked on the original integration of
libxml and ICU in 2007.  http://crosreview.com/729616.  The patch is ready
to submit against the chromium local copy of libxml, but it is preferable
to have this accepted into libxml if you are happy with it, and we can then
take the patch from libxml and stay in sync.

If it is any help, I can send this patch as a pull request to the libxml
github repo, or I can also create a libxml bug.  It looks like libxml
preference is to take patches on the mailing list.

Thanks,
Joel
diff --git a/encoding.c b/encoding.c
index cd019c51..617e1ed7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -110,6 +110,9 @@ openIcuConverter(const char* name, int toUnicode)
   if (conv == NULL)
     return NULL;
 
+  conv->pivot_source = conv->pivot_buf;
+  conv->pivot_target = conv->pivot_buf;
+
   conv->uconv = ucnv_open(name, &status);
   if (U_FAILURE(status))
     goto error;
@@ -1850,6 +1853,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
  * @outlen:  the length of @out
  * @in:  a pointer to an array of ISO Latin 1 chars
  * @inlen:  the length of @in
+ * @flush: if true, indicates end of input
  *
  * Returns 0 if success, or
  *     -1 by lack of space, or
@@ -1863,7 +1867,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
  */
 static int
 xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
-                const unsigned char *in, int *inlen) {
+                const unsigned char *in, int *inlen, int flush) {
     const char *ucv_in = (const char *) in;
     char *ucv_out = (char *) out;
     UErrorCode err = U_ZERO_ERROR;
@@ -1873,33 +1877,30 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
         return(-1);
     }
 
-    /*
-     * TODO(jungshik)
-     * 1. is ucnv_convert(To|From)Algorithmic better?
-     * 2. had we better use an explicit pivot buffer?
-     * 3. error returned comes from 'fromUnicode' only even
-     *    when toUnicode is true !
-     */
     if (toUnicode) {
         /* encoding => UTF-16 => UTF-8 */
         ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
-                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
-                       0, TRUE, &err);
+                       &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+                       &cd->pivot_source, &cd->pivot_target,
+                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
     } else {
         /* UTF-8 => UTF-16 => encoding */
         ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
-                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
-                       0, TRUE, &err);
+                       &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+                       &cd->pivot_source, &cd->pivot_target,
+                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
     }
     *inlen = ucv_in - (const char*) in;
     *outlen = ucv_out - (char *) out;
+    /* reset pivot buffer if this is the last call for input (flush==TRUE) */
+    if (flush)
+        cd->pivot_source = cd->pivot_target = cd->pivot_buf;
     if (U_SUCCESS(err))
         return 0;
     if (err == U_BUFFER_OVERFLOW_ERROR)
         return -1;
     if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
         return -2;
-    /* if (err == U_TRUNCATED_CHAR_FOUND) */
     return -3;
 }
 #endif /* LIBXML_ICU_ENABLED */
@@ -1912,7 +1913,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
 
 static int
 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
-                 int *outlen, const unsigned char *in, int *inlen) {
+                 int *outlen, const unsigned char *in, int *inlen, int flush) {
     int ret;
 
     if (handler->input != NULL) {
@@ -1925,7 +1926,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
 #endif /* LIBXML_ICONV_ENABLED */
 #ifdef LIBXML_ICU_ENABLED
     else if (handler->uconv_in != NULL) {
-        ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
+        ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
+                              flush);
     }
 #endif /* LIBXML_ICU_ENABLED */
     else {
@@ -1953,7 +1955,8 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
 #endif /* LIBXML_ICONV_ENABLED */
 #ifdef LIBXML_ICU_ENABLED
     else if (handler->uconv_out != NULL) {
-        ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
+        ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
+                              TRUE);
     }
 #endif /* LIBXML_ICU_ENABLED */
     else {
@@ -2015,7 +2018,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
     }
 
     ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv);
+                           in->content, &toconv, 0);
     xmlBufferShrink(in, toconv);
     out->use += written;
     out->content[out->use] = 0;
@@ -2133,7 +2136,7 @@ xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
     c_in = toconv;
     c_out = written;
     ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                           xmlBufContent(in), &c_in);
+                           xmlBufContent(in), &c_in, 0);
     xmlBufShrink(in, c_in);
     xmlBufAddLen(out, c_out);
     if (ret == -1)
@@ -2231,7 +2234,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
     c_in = toconv;
     c_out = written;
     ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                           xmlBufContent(in), &c_in);
+                           xmlBufContent(in), &c_in, flush);
     xmlBufShrink(in, c_in);
     xmlBufAddLen(out, c_out);
     if (ret == -1)
@@ -2285,6 +2288,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
  * @handler:	char encoding transformation data structure
  * @out:  an xmlBuffer for the output.
  * @in:  an xmlBuffer for the input
+ * @flush: if true, indicates end of input.
  *
  * Generic front-end for the encoding handler input function
  *
@@ -2295,7 +2299,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
  */
 int
 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
-                 xmlBufferPtr in)
+                 xmlBufferPtr in, int flush)
 {
     int ret;
     int written;
@@ -2317,7 +2321,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
         written = out->size - out->use - 1;
     }
     ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv);
+                           in->content, &toconv, flush);
     xmlBufferShrink(in, toconv);
     out->use += written;
     out->content[out->use] = 0;
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 7967cc66..e3ad1950 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -129,9 +129,14 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
  * If iconv is supported, there are two extra fields.
  */
 #ifdef LIBXML_ICU_ENABLED
+/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
+#define ICU_PIVOT_BUF_SIZE 1024
 struct _uconv_t {
   UConverter *uconv; /* for conversion between an encoding and UTF-16 */
   UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
+  UChar      *pivot_source;
+  UChar      *pivot_target;
 };
 typedef struct _uconv_t uconv_t;
 #endif
@@ -210,7 +215,8 @@ XMLPUBFUN int XMLCALL
 XMLPUBFUN int XMLCALL
 	xmlCharEncInFunc		(xmlCharEncodingHandler *handler,
 					 xmlBufferPtr out,
-					 xmlBufferPtr in);
+					 xmlBufferPtr in,
+					 int flush);
 XMLPUBFUN int XMLCALL
 	xmlCharEncFirstLine		(xmlCharEncodingHandler *handler,
 					 xmlBufferPtr out,
diff --git a/result/icu_parse_test.xml b/result/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/result/icu_parse_test.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF8-"?>
+<foo>
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_</foo>
diff --git a/result/icu_parse_test.xml.rde b/result/icu_parse_test.xml.rde
new file mode 100644
index 00000000..6af31937
--- /dev/null
+++ b/result/icu_parse_test.xml.rde
@@ -0,0 +1,14 @@
+0 1 foo 0 0
+1 3 #text 0 1 
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
+0 15 foo 0 0
diff --git a/result/icu_parse_test.xml.rdr b/result/icu_parse_test.xml.rdr
new file mode 100644
index 00000000..6af31937
--- /dev/null
+++ b/result/icu_parse_test.xml.rdr
@@ -0,0 +1,14 @@
+0 1 foo 0 0
+1 3 #text 0 1 
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
+0 15 foo 0 0
diff --git a/result/icu_parse_test.xml.sax b/result/icu_parse_test.xml.sax
new file mode 100644
index 00000000..1ac3a072
--- /dev/null
+++ b/result/icu_parse_test.xml.sax
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(foo)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElement(foo)
+SAX.endDocument()
diff --git a/result/icu_parse_test.xml.sax2 b/result/icu_parse_test.xml.sax2
new file mode 100644
index 00000000..97294a1b
--- /dev/null
+++ b/result/icu_parse_test.xml.sax2
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElementNs(foo, NULL, NULL, 0, 0, 0)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElementNs(foo, NULL, NULL)
+SAX.endDocument()
diff --git a/result/noent/icu_parse_test.xml b/result/noent/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/result/noent/icu_parse_test.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF8-"?>
+<foo>
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_</foo>
diff --git a/result/noent/icu_parse_test.xml.sax2 b/result/noent/icu_parse_test.xml.sax2
new file mode 100644
index 00000000..97294a1b
--- /dev/null
+++ b/result/noent/icu_parse_test.xml.sax2
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElementNs(foo, NULL, NULL, 0, 0, 0)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElementNs(foo, NULL, NULL)
+SAX.endDocument()
diff --git a/test/icu_parse_test.xml b/test/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/test/icu_parse_test.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF8-"?>
+<foo>
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_</foo>
diff --git a/testapi.c b/testapi.c
index 168ceb67..ef3f4733 100644
--- a/testapi.c
+++ b/testapi.c
@@ -8785,7 +8785,7 @@ test_xmlCharEncInFunc(void) {
         out = gen_xmlBufferPtr(n_out, 1);
         in = gen_xmlBufferPtr(n_in, 2);
 
-        ret_val = xmlCharEncInFunc(handler, out, in);
+        ret_val = xmlCharEncInFunc(handler, out, in, 1);
         desret_int(ret_val);
         call_tests++;
         des_xmlCharEncodingHandler_ptr(n_handler, handler, 0);
_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
xml@gnome.org
https://mail.gnome.org/mailman/listinfo/xml

Reply via email to