This is an automated email from the ASF dual-hosted git repository. borisk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/xerces-c.git
commit 8e21670f8dd6ffa8da789e5df8bb29f5229252fc Author: Alberto Massari <amass...@apache.org> AuthorDate: Sun Dec 15 21:18:15 2019 +0000 [XERCESC-2180] Remove assertion when a surrogate pair is split by the boundary of an input buffer (transcoders try to avoid this, but UTF-16 transcoder doesn't have this check in place). The reader now pulls in more data on demand. git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@1871620 13f79535-47bb-0310-9956-ffa450edef68 --- src/xercesc/internal/XMLReader.cpp | 67 +++++++++++++++++---- .../XSTSHarness/regression/XERCESC-2180/crash.xml | Bin 0 -> 6 bytes .../XSTSHarness/regression/XERCESC-2180/crash2.xml | Bin 0 -> 32778 bytes tests/src/XSTSHarness/regression/XercesXML.testSet | 8 +++ 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/src/xercesc/internal/XMLReader.cpp b/src/xercesc/internal/XMLReader.cpp index befe51c..405474a 100644 --- a/src/xercesc/internal/XMLReader.cpp +++ b/src/xercesc/internal/XMLReader.cpp @@ -646,11 +646,16 @@ bool XMLReader::getName(XMLBuffer& toFill, const bool token) if (!token) { if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) { - // make sure one more char is in the buffer, the transcoder - // should put only a complete surrogate pair into the buffer - assert(fCharIndex+1 < fCharsAvail); - if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) - return false; + // if there isn't one more char in the buffer, read more data + if (fCharIndex+1 == fCharsAvail) + { + if (!refreshCharBuffer()) + return false; + // reset the start buffer to the new location of the cursor + charIndex_start = fCharIndex; + } + if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) + return false; // Looks ok, so lets eat it fCharIndex += 2; @@ -675,9 +680,21 @@ bool XMLReader::getName(XMLBuffer& toFill, const bool token) // break out. if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) ) { - // make sure one more char is in the buffer, the transcoder - // should put only a complete surrogate pair into the buffer - assert(fCharIndex+1 < fCharsAvail); + // if there isn't one more char in the buffer, read more data + if (fCharIndex+1 == fCharsAvail) + { + // but first copy the accepted character(s), and update column + if (fCharIndex != charIndex_start) + { + fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start); + toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start); + } + + if (!refreshCharBuffer()) + break; + + charIndex_start = fCharIndex; + } if ( (fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF) ) break; @@ -721,9 +738,14 @@ bool XMLReader::getNCName(XMLBuffer& toFill) // what's the point in living mannnn? Just give up now. We only do this // if its a name and not a name token that they want. if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) { - // make sure one more char is in the buffer, the transcoder - // should put only a complete surrogate pair into the buffer - assert(fCharIndex+1 < fCharsAvail); + // if there isn't one more char in the buffer, read more data + if (fCharIndex+1 == fCharsAvail) + { + if (!refreshCharBuffer()) + return false; + // reset the start buffer to the new location of the cursor + charIndex_start = fCharIndex; + } if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) return false; @@ -758,7 +780,28 @@ bool XMLReader::getNCName(XMLBuffer& toFill) // Check the current char and take it if it's a name char while(fCharIndex < fCharsAvail) { - if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && fCharIndex+1 < fCharsAvail && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2; + if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) + { + // if there isn't one more char in the buffer, read more data + if (fCharIndex+1 == fCharsAvail) + { + // but first copy the accepted character(s), and update column + if (fCharIndex != charIndex_start) + { + fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start); + toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start); + } + + if (!refreshCharBuffer()) + break; + + charIndex_start = fCharIndex; + } + if ( (fCharBuf[fCharIndex+1] < 0xDC00) || + (fCharBuf[fCharIndex+1] > 0xDFFF) ) + break; + fCharIndex += 2; + } else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++; else break; } diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml new file mode 100644 index 0000000..a8de93b Binary files /dev/null and b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml differ diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml new file mode 100644 index 0000000..e46a9fd Binary files /dev/null and b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml differ diff --git a/tests/src/XSTSHarness/regression/XercesXML.testSet b/tests/src/XSTSHarness/regression/XercesXML.testSet new file mode 100644 index 0000000..a7a2427 --- /dev/null +++ b/tests/src/XSTSHarness/regression/XercesXML.testSet @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="utf-8"?> +<TESTSUITE> + <TESTCASES xml:base="XERCESC-2180"> + <!-- https://issues.apache.org/jira/browse/XERCESC-2180: Assertion when scanner splits a surrogate pair across two separate buffers --> + <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash.xml"/> + <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash2.xml"/> + </TESTCASES> +</TESTSUITE> \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: c-dev-unsubscr...@xerces.apache.org For additional commands, e-mail: c-dev-h...@xerces.apache.org