I've updated the patch using git format-patch.

* reverted public interface xmlCharEncInFunc.  It now calls
xmlEncInputChunk with flush=1 on all calls as suggested.
Always setting flush=TRUE here makes sense since this is a one-shot
conversion of the entire buffer.

* Moved the pivot buf reset to only happen on success (suggestion from
Markus).

* removed test/icu_parse_test.xml from patch.  This file is attached
separately to this email.
The original bug found by fuzzer only relates to UTF8 decoding, so using
Shift-JIS or anything else wont help.  I can't think of any way that this
test can simultaneously
 - be effective to test the bug when libxml is compiled with icu
 - still parse correctly and pass when libxml is compiled with iconv
The trick is that we need an encoding which will:
 - not be handled natively by libxml
 - be recognized by icu as UTF8
 - be recognized by iconv as UTF8


On Thu, Oct 26, 2017 at 3:21 AM, Nick Wellnhofer <wellnho...@aevum.de>
wrote:

> On 25/10/2017 17:40, Markus Scherer wrote:
>
>> On Wed, Oct 25, 2017 at 4:02 AM, Nick Wellnhofer <wellnho...@aevum.de
>>  The patch changes public function xmlCharEncInFunc but this function isn't
>>     used internally anymore (since commit a78d8036 from 2012). It might
>> still
>>     be used in client code that wants to use libxml2's character
>> conversion
>>     facilities, though. Maybe it's better to remove the `flush` parameter
>> and
>>     always call xmlEncInputChunk with `flush` set to 1. This should at
>> least
>>     allow one-shot character conversions without breaking the public API.
>>
>> The ICU conversion functions must be called with flush=FALSE before the
>> end of the stream, and with flush=TRUE at the end of the stream.
>>
>
> Yes, but I'm only talking about xmlCharEncInFunc which isn't used
> internally.
>
> Nick
>
>
From f495b5546927032fb5b3988d66949d3d1b735aa9 Mon Sep 17 00:00:00 2001
From: Joel Hockey <joel.hoc...@gmail.com>
Date: Wed, 25 Oct 2017 18:11:12 -0700
Subject: [PATCH] Fixed ICU to set flush correctly and provide pivot buffer.

By always setting flush=TRUE when doing multiple reads, ICU
will not correctly handle truncated utf8 chars across read
boundaries.

The fix is to set flush=TRUE only on final read, and to
provide a pivot buffer which is maintained by libxml
between calls to ucnv_convertEx.
---
 encoding.c                | 46 +++++++++++++++++++++++++---------------------
 include/libxml/encoding.h |  5 +++++
 testapi.c                 |  2 +-
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/encoding.c b/encoding.c
index cd019c51..5dac8a14 100644
--- a/encoding.c
+++ b/encoding.c
@@ -110,6 +110,9 @@ openIcuConverter(const char* name, int toUnicode)
   if (conv == NULL)
     return NULL;
 
+  conv->pivot_source = conv->pivot_buf;
+  conv->pivot_target = conv->pivot_buf;
+
   conv->uconv = ucnv_open(name, &status);
   if (U_FAILURE(status))
     goto error;
@@ -1850,6 +1853,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
  * @outlen:  the length of @out
  * @in:  a pointer to an array of ISO Latin 1 chars
  * @inlen:  the length of @in
+ * @flush: if true, indicates end of input
  *
  * Returns 0 if success, or
  *     -1 by lack of space, or
@@ -1863,7 +1867,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
  */
 static int
 xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
-                const unsigned char *in, int *inlen) {
+                const unsigned char *in, int *inlen, int flush) {
     const char *ucv_in = (const char *) in;
     char *ucv_out = (char *) out;
     UErrorCode err = U_ZERO_ERROR;
@@ -1873,33 +1877,31 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
         return(-1);
     }
 
-    /*
-     * TODO(jungshik)
-     * 1. is ucnv_convert(To|From)Algorithmic better?
-     * 2. had we better use an explicit pivot buffer?
-     * 3. error returned comes from 'fromUnicode' only even
-     *    when toUnicode is true !
-     */
     if (toUnicode) {
         /* encoding => UTF-16 => UTF-8 */
         ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
-                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
-                       0, TRUE, &err);
+                       &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+                       &cd->pivot_source, &cd->pivot_target,
+                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
     } else {
         /* UTF-8 => UTF-16 => encoding */
         ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
-                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
-                       0, TRUE, &err);
+                       &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+                       &cd->pivot_source, &cd->pivot_target,
+                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
     }
     *inlen = ucv_in - (const char*) in;
     *outlen = ucv_out - (char *) out;
-    if (U_SUCCESS(err))
+    if (U_SUCCESS(err)) {
+        /* reset pivot buf if this is the last call for input (flush==TRUE) */
+        if (flush)
+            cd->pivot_source = cd->pivot_target = cd->pivot_buf;
         return 0;
+    }
     if (err == U_BUFFER_OVERFLOW_ERROR)
         return -1;
     if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
         return -2;
-    /* if (err == U_TRUNCATED_CHAR_FOUND) */
     return -3;
 }
 #endif /* LIBXML_ICU_ENABLED */
@@ -1912,7 +1914,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
 
 static int
 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
-                 int *outlen, const unsigned char *in, int *inlen) {
+                 int *outlen, const unsigned char *in, int *inlen, int flush) {
     int ret;
 
     if (handler->input != NULL) {
@@ -1925,7 +1927,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
 #endif /* LIBXML_ICONV_ENABLED */
 #ifdef LIBXML_ICU_ENABLED
     else if (handler->uconv_in != NULL) {
-        ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
+        ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
+                              flush);
     }
 #endif /* LIBXML_ICU_ENABLED */
     else {
@@ -1953,7 +1956,8 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
 #endif /* LIBXML_ICONV_ENABLED */
 #ifdef LIBXML_ICU_ENABLED
     else if (handler->uconv_out != NULL) {
-        ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
+        ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
+                              TRUE);
     }
 #endif /* LIBXML_ICU_ENABLED */
     else {
@@ -2015,7 +2019,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
     }
 
     ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv);
+                           in->content, &toconv, 0);
     xmlBufferShrink(in, toconv);
     out->use += written;
     out->content[out->use] = 0;
@@ -2133,7 +2137,7 @@ xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
     c_in = toconv;
     c_out = written;
     ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                           xmlBufContent(in), &c_in);
+                           xmlBufContent(in), &c_in, 0);
     xmlBufShrink(in, c_in);
     xmlBufAddLen(out, c_out);
     if (ret == -1)
@@ -2231,7 +2235,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
     c_in = toconv;
     c_out = written;
     ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                           xmlBufContent(in), &c_in);
+                           xmlBufContent(in), &c_in, flush);
     xmlBufShrink(in, c_in);
     xmlBufAddLen(out, c_out);
     if (ret == -1)
@@ -2317,7 +2321,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
         written = out->size - out->use - 1;
     }
     ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv);
+                           in->content, &toconv, 1);
     xmlBufferShrink(in, toconv);
     out->use += written;
     out->content[out->use] = 0;
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 7967cc66..c875af6f 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -129,9 +129,14 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
  * If iconv is supported, there are two extra fields.
  */
 #ifdef LIBXML_ICU_ENABLED
+/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
+#define ICU_PIVOT_BUF_SIZE 1024
 struct _uconv_t {
   UConverter *uconv; /* for conversion between an encoding and UTF-16 */
   UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
+  UChar      *pivot_source;
+  UChar      *pivot_target;
 };
 typedef struct _uconv_t uconv_t;
 #endif
diff --git a/testapi.c b/testapi.c
index 168ceb67..ef3f4733 100644
--- a/testapi.c
+++ b/testapi.c
@@ -8785,7 +8785,7 @@ test_xmlCharEncInFunc(void) {
         out = gen_xmlBufferPtr(n_out, 1);
         in = gen_xmlBufferPtr(n_in, 2);
 
-        ret_val = xmlCharEncInFunc(handler, out, in);
+        ret_val = xmlCharEncInFunc(handler, out, in, 1);
         desret_int(ret_val);
         call_tests++;
         des_xmlCharEncodingHandler_ptr(n_handler, handler, 0);
-- 
2.15.0.rc2.357.g7e34df9404-goog

<?xml version="1.0" encoding="UTF8-"?>
<foo>
Text with UTF8 chars at position 214 (0xd6)
______
_______________
_______________
_______________
_______________
_______________
_______________
_______________
_______񑘓£_
_</foo>
_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
xml@gnome.org
https://mail.gnome.org/mailman/listinfo/xml

Reply via email to