[xerces-c] 04/13: [XERCESC-2180] Remove assertion when a surrogate pair is split by the boundary of an input buffer (transcoders try to avoid this, but UTF-16 transcoder doesn't have this check in place). The reader now pulls in more data on demand.

borisk Fri, 10 Jan 2020 05:47:48 -0800

This is an automated email from the ASF dual-hosted git repository.

borisk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/xerces-c.git


commit 8e21670f8dd6ffa8da789e5df8bb29f5229252fc
Author: Alberto Massari <amass...@apache.org>
AuthorDate: Sun Dec 15 21:18:15 2019 +0000

    [XERCESC-2180] Remove assertion when a surrogate pair is split by the boundary
    of an input buffer (transcoders try to avoid this, but UTF-16 transcoder doesn't
    have this check in place). The reader now pulls in more data on demand.
    
    git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@1871620 13f79535-47bb-0310-9956-ffa450edef68
---
 src/xercesc/internal/XMLReader.cpp                 |  67 +++++++++++++++++----
 .../XSTSHarness/regression/XERCESC-2180/crash.xml  | Bin 0 -> 6 bytes
 .../XSTSHarness/regression/XERCESC-2180/crash2.xml | Bin 0 -> 32778 bytes
 tests/src/XSTSHarness/regression/XercesXML.testSet |   8 +++
 4 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/src/xercesc/internal/XMLReader.cpp 
b/src/xercesc/internal/XMLReader.cpp
index befe51c..405474a 100644
--- a/src/xercesc/internal/XMLReader.cpp
+++ b/src/xercesc/internal/XMLReader.cpp
@@ -646,11 +646,16 @@ bool XMLReader::getName(XMLBuffer& toFill, const bool 
token)
     if (!token)
     {
         if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 
0xDB7F)) {
-           // make sure one more char is in the buffer, the transcoder
-           // should put only a complete surrogate pair into the buffer
-           assert(fCharIndex+1 < fCharsAvail);
-           if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 
0xDFFF))
-               return false;
+            // if there isn't one more char in the buffer, read more data
+            if (fCharIndex+1 == fCharsAvail)
+            {
+                if (!refreshCharBuffer())
+                    return false;
+                // reset the start buffer to the new location of the cursor
+                charIndex_start = fCharIndex;
+            }
+            if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 
0xDFFF))
+                return false;
 
             // Looks ok, so lets eat it
             fCharIndex += 2;
@@ -675,9 +680,21 @@ bool XMLReader::getName(XMLBuffer& toFill, const bool 
token)
             //  break out.
             if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 
0xDB7F) )
             {
-                // make sure one more char is in the buffer, the transcoder
-                // should put only a complete surrogate pair into the buffer
-                assert(fCharIndex+1 < fCharsAvail);
+                // if there isn't one more char in the buffer, read more data
+                if (fCharIndex+1 == fCharsAvail)
+                {
+                    // but first copy the accepted character(s), and update 
column
+                    if (fCharIndex != charIndex_start)
+                    {
+                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - 
charIndex_start);
+                    }
+
+                    if (!refreshCharBuffer())
+                        break;
+
+                    charIndex_start = fCharIndex;
+                }
                 if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                         (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                     break;
@@ -721,9 +738,14 @@ bool XMLReader::getNCName(XMLBuffer& toFill)
     //  what's the point in living mannnn? Just give up now. We only do this
     //  if its a name and not a name token that they want.
     if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
-        // make sure one more char is in the buffer, the transcoder
-        // should put only a complete surrogate pair into the buffer
-        assert(fCharIndex+1 < fCharsAvail);
+        // if there isn't one more char in the buffer, read more data
+        if (fCharIndex+1 == fCharsAvail)
+        {
+            if (!refreshCharBuffer())
+                return false;
+            // reset the start buffer to the new location of the cursor
+            charIndex_start = fCharIndex;
+        }
         if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 
0xDFFF))
             return false;
 
@@ -758,7 +780,28 @@ bool XMLReader::getNCName(XMLBuffer& toFill)
         //  Check the current char and take it if it's a name char
         while(fCharIndex < fCharsAvail)
         {
-            if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 
0xDB7F) && fCharIndex+1 < fCharsAvail && ((fCharBuf[fCharIndex+1] < 0xDC00) || 
(fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2;
+            if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 
0xDB7F))
+            {
+                // if there isn't one more char in the buffer, read more data
+                if (fCharIndex+1 == fCharsAvail)
+                {
+                    // but first copy the accepted character(s), and update 
column
+                    if (fCharIndex != charIndex_start)
+                    {
+                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - 
charIndex_start);
+                    }
+
+                    if (!refreshCharBuffer())
+                        break;
+
+                    charIndex_start = fCharIndex;
+                }
+                if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
+                    (fCharBuf[fCharIndex+1] > 0xDFFF)  )
+                    break;
+                fCharIndex += 2;
+            }
             else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
             else break;
         }
diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml 
b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml
new file mode 100644
index 0000000..a8de93b
Binary files /dev/null and 
b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml differ
diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml 
b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml
new file mode 100644
index 0000000..e46a9fd
Binary files /dev/null and 
b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml differ
diff --git a/tests/src/XSTSHarness/regression/XercesXML.testSet 
b/tests/src/XSTSHarness/regression/XercesXML.testSet
new file mode 100644
index 0000000..a7a2427
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XercesXML.testSet
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<TESTSUITE>
+  <TESTCASES xml:base="XERCESC-2180">
+    <!-- https://issues.apache.org/jira/browse/XERCESC-2180: Assertion when 
scanner splits a surrogate pair across two separate buffers -->
+    <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash.xml"/>
+    <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash2.xml"/>
+  </TESTCASES>
+</TESTSUITE>
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: c-dev-unsubscr...@xerces.apache.org
For additional commands, e-mail: c-dev-h...@xerces.apache.org

[xerces-c] 04/13: [XERCESC-2180] Remove assertion when a surrogate pair is split by the boundary of an input buffer (transcoders try to avoid this, but UTF-16 transcoder doesn't have this check in place). The reader now pulls in more data on demand.

Reply via email to