andyc 2003/01/31 13:35:23 Modified: java/docs releases.xml java/src/org/apache/xerces/impl XMLDocumentFragmentScannerImpl.java XMLEntityScanner.java Log: Updated the scanner implementation to NOT buffer when scanning CDATA sections. This enables Xerces to use less memory and be able to parse very large CDATA sections which would previously cause out-of-memory exceptions. Reported by: Jim Layer <[EMAIL PROTECTED]> Revision Changes Path 1.148 +14 -2 xml-xerces/java/docs/releases.xml Index: releases.xml =================================================================== RCS file: /home/cvs/xml-xerces/java/docs/releases.xml,v retrieving revision 1.147 retrieving revision 1.148 diff -u -r1.147 -r1.148 --- releases.xml 27 Jan 2003 17:06:19 -0000 1.147 +++ releases.xml 31 Jan 2003 21:35:22 -0000 1.148 @@ -1,7 +1,19 @@ -<?xml version='1.0' encoding='UTF-8'?> +<?xml version='1.0' encoding='UTF-8'?> <!-- $Id$ --> <!DOCTYPE releases SYSTEM 'dtd/releases.dtd'> <releases> + <release version='&ParserName; TBD'> + <desc> + To be determined... + </desc> + <fix> + <note> + Fixed scanner implementation to be able to handle large CDATA sections + without buffering. + </note> + <submitter name='Andy Clark'/> + </fix> + </release> <release version="&ParserName; 2.3.0"> <desc> With this release, the Xerces-J developers are declaring the Xerces 1.27 +2 -2 xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java Index: XMLDocumentFragmentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java,v retrieving revision 1.26 retrieving revision 1.27 diff -u -r1.26 -r1.27 --- XMLDocumentFragmentScannerImpl.java 7 Dec 2002 00:15:58 -0000 1.26 +++ XMLDocumentFragmentScannerImpl.java 31 Jan 2003 21:35:22 -0000 1.27 @@ -963,7 +963,7 @@ while (true) { fStringBuffer.clear(); - if (!fEntityScanner.scanData("]]", fStringBuffer)) { + if (fEntityScanner.scanData("]]", fStringBuffer)) { if (fDocumentHandler != null && fStringBuffer.length > 0) { fDocumentHandler.characters(fStringBuffer, null); } 1.10 +140 -131 xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java Index: XMLEntityScanner.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- XMLEntityScanner.java 16 Dec 2002 01:26:19 -0000 1.9 +++ XMLEntityScanner.java 31 Jan 2003 21:35:22 -0000 1.10 @@ -890,167 +890,176 @@ public boolean scanData(String delimiter, XMLStringBuffer buffer) throws IOException { - boolean done = false; + // REVISIT: This method does not need to use a string buffer. + // The change would avoid the array copies and increase + // performance. -Ac + // + // Currently, this method is only called for scanning + // CDATA sections and processing instruction data. So + // if this code is updated to NOT buffer, the scanning + // code for processing instructions will need to be + // updated to do its own buffering. The code for CDATA + // sections is safe as-is. -Ac + + boolean found = false; int delimLen = delimiter.length(); char charAt0 = delimiter.charAt(0); boolean external = fCurrentEntity.isExternal(); - do { + if (DEBUG_BUFFER) { + System.out.print("(scanData: "); + XMLEntityManager.print(fCurrentEntity); + System.out.println(); + } + + // load more characters, if needed + + if (fCurrentEntity.position == fCurrentEntity.count) { + load(0, true); + } + + boolean bNextEntity = false; + + while ((fCurrentEntity.position >= fCurrentEntity.count - delimLen) + && (!bNextEntity)) + { + System.arraycopy(fCurrentEntity.ch, + fCurrentEntity.position, + fCurrentEntity.ch, + 0, + fCurrentEntity.count - fCurrentEntity.position); + + bNextEntity = load(fCurrentEntity.count - fCurrentEntity.position, false); + fCurrentEntity.position = 0; + } + + if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { + // something must be wrong with the input: e.g., file ends an unterminated comment + int length = fCurrentEntity.count - fCurrentEntity.position; + buffer.append (fCurrentEntity.ch, fCurrentEntity.position, length); + fCurrentEntity.columnNumber += fCurrentEntity.count; + fCurrentEntity.position = fCurrentEntity.count; + load(0,true); + return false; + } + + // normalize newlines + int offset = fCurrentEntity.position; + int c = fCurrentEntity.ch[offset]; + int newlines = 0; + if (c == '\n' || (c == '\r' && external)) { if (DEBUG_BUFFER) { - System.out.print("(scanData: "); + System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": "); XMLEntityManager.print(fCurrentEntity); System.out.println(); } - - // load more characters, if needed - - if (fCurrentEntity.position == fCurrentEntity.count) { - load(0, true); - } - - boolean bNextEntity = false; - - while ((fCurrentEntity.position >= fCurrentEntity.count - delimLen) - && (!bNextEntity)) - { - System.arraycopy(fCurrentEntity.ch, - fCurrentEntity.position, - fCurrentEntity.ch, - 0, - fCurrentEntity.count - fCurrentEntity.position); - - bNextEntity = load(fCurrentEntity.count - fCurrentEntity.position, false); - fCurrentEntity.position = 0; - } - - if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { - // something must be wrong with the input: e.g., file ends an unterminated comment - int length = fCurrentEntity.count - fCurrentEntity.position; - buffer.append (fCurrentEntity.ch, fCurrentEntity.position, length); - fCurrentEntity.columnNumber += fCurrentEntity.count; - fCurrentEntity.position = fCurrentEntity.count; - load(0,true); - return false; - } - - // normalize newlines - int offset = fCurrentEntity.position; - int c = fCurrentEntity.ch[offset]; - int newlines = 0; - if (c == '\n' || (c == '\r' && external)) { - if (DEBUG_BUFFER) { - System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": "); - XMLEntityManager.print(fCurrentEntity); - System.out.println(); - } - do { - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (c == '\r' && external) { - newlines++; - fCurrentEntity.lineNumber++; - fCurrentEntity.columnNumber = 1; - if (fCurrentEntity.position == fCurrentEntity.count) { - offset = 0; - fCurrentEntity.position = newlines; - if (load(newlines, false)) { - break; - } - } - if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') { - fCurrentEntity.position++; - offset++; - } - /*** NEWLINE NORMALIZATION ***/ - else { - newlines++; + do { + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (c == '\r' && external) { + newlines++; + fCurrentEntity.lineNumber++; + fCurrentEntity.columnNumber = 1; + if (fCurrentEntity.position == fCurrentEntity.count) { + offset = 0; + fCurrentEntity.position = newlines; + if (load(newlines, false)) { + break; } } - else if (c == '\n') { - newlines++; - fCurrentEntity.lineNumber++; - fCurrentEntity.columnNumber = 1; - if (fCurrentEntity.position == fCurrentEntity.count) { - offset = 0; - fCurrentEntity.position = newlines; - fCurrentEntity.count = newlines; - if (load(newlines, false)) { - break; - } - } + if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') { + fCurrentEntity.position++; + offset++; } + /*** NEWLINE NORMALIZATION ***/ else { - fCurrentEntity.position--; - break; + newlines++; } - } while (fCurrentEntity.position < fCurrentEntity.count - 1); - for (int i = offset; i < fCurrentEntity.position; i++) { - fCurrentEntity.ch[i] = '\n'; } - int length = fCurrentEntity.position - offset; - if (fCurrentEntity.position == fCurrentEntity.count - 1) { - buffer.append(fCurrentEntity.ch, offset, length); - if (DEBUG_BUFFER) { - System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); - XMLEntityManager.print(fCurrentEntity); - System.out.println(); + else if (c == '\n') { + newlines++; + fCurrentEntity.lineNumber++; + fCurrentEntity.columnNumber = 1; + if (fCurrentEntity.position == fCurrentEntity.count) { + offset = 0; + fCurrentEntity.position = newlines; + fCurrentEntity.count = newlines; + if (load(newlines, false)) { + break; + } } - return true; } + else { + fCurrentEntity.position--; + break; + } + } while (fCurrentEntity.position < fCurrentEntity.count - 1); + for (int i = offset; i < fCurrentEntity.position; i++) { + fCurrentEntity.ch[i] = '\n'; + } + int length = fCurrentEntity.position - offset; + if (fCurrentEntity.position == fCurrentEntity.count - 1) { + buffer.append(fCurrentEntity.ch, offset, length); if (DEBUG_BUFFER) { System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); XMLEntityManager.print(fCurrentEntity); System.out.println(); } + return true; + } + if (DEBUG_BUFFER) { + System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); + XMLEntityManager.print(fCurrentEntity); + System.out.println(); } + } - // iterate over buffer looking for delimiter - OUTER: while (fCurrentEntity.position < fCurrentEntity.count) { - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (c == charAt0) { - // looks like we just hit the delimiter - int delimOffset = fCurrentEntity.position - 1; - for (int i = 1; i < delimLen; i++) { - if (fCurrentEntity.position == fCurrentEntity.count) { - fCurrentEntity.position -= i; - break OUTER; - } - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (delimiter.charAt(i) != c) { - fCurrentEntity.position--; - break; - } + // iterate over buffer looking for delimiter + OUTER: while (fCurrentEntity.position < fCurrentEntity.count) { + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (c == charAt0) { + // looks like we just hit the delimiter + int delimOffset = fCurrentEntity.position - 1; + for (int i = 1; i < delimLen; i++) { + if (fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.position -= i; + break OUTER; } - if (fCurrentEntity.position == delimOffset + delimLen) { - done = true; + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (delimiter.charAt(i) != c) { + fCurrentEntity.position--; break; } } - else if (c == '\n' || (external && c == '\r')) { - fCurrentEntity.position--; + if (fCurrentEntity.position == delimOffset + delimLen) { + found = true; break; } - else if (XMLChar.isInvalid(c)) { - fCurrentEntity.position--; - int length = fCurrentEntity.position - offset; - fCurrentEntity.columnNumber += length - newlines; - buffer.append(fCurrentEntity.ch, offset, length); - return true; - } } - int length = fCurrentEntity.position - offset; - fCurrentEntity.columnNumber += length - newlines; - if (done) { - length -= delimLen; - } - buffer.append (fCurrentEntity.ch, offset, length); - - // return true if string was skipped - if (DEBUG_BUFFER) { - System.out.print(")scanData: "); - XMLEntityManager.print(fCurrentEntity); - System.out.println(" -> " + done); + else if (c == '\n' || (external && c == '\r')) { + fCurrentEntity.position--; + break; } - } while (!done); - return !done; + else if (XMLChar.isInvalid(c)) { + fCurrentEntity.position--; + int length = fCurrentEntity.position - offset; + fCurrentEntity.columnNumber += length - newlines; + buffer.append(fCurrentEntity.ch, offset, length); + return true; + } + } + int length = fCurrentEntity.position - offset; + fCurrentEntity.columnNumber += length - newlines; + if (found) { + length -= delimLen; + } + buffer.append (fCurrentEntity.ch, offset, length); + + // return true if string was skipped + if (DEBUG_BUFFER) { + System.out.print(")scanData: "); + XMLEntityManager.print(fCurrentEntity); + System.out.println(" -> " + found); + } + return found; } // scanData(String,XMLString)
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]