neilg 2002/09/11 13:22:35 Modified: java/src/org/apache/xerces/impl XMLEntityManager.java XMLEntityScanner.java XMLDocumentFragmentScannerImpl.java XMLScanner.java Log: This change attempts to address poor performance when parsing documents with very large comments. I observed between 10% and 15% improvement (depending on the kind of parser being used) on a 200K file with a 100K comment. Revision Changes Path 1.43 +123 -127 xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java Index: XMLEntityManager.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java,v retrieving revision 1.42 retrieving revision 1.43 diff -u -r1.42 -r1.43 --- XMLEntityManager.java 10 Sep 2002 14:20:07 -0000 1.42 +++ XMLEntityManager.java 11 Sep 2002 20:22:35 -0000 1.43 @@ -79,6 +79,7 @@ import org.apache.xerces.impl.validation.ValidationManager; import org.apache.xerces.util.EncodingMap; +import org.apache.xerces.util.XMLStringBuffer; import org.apache.xerces.util.SymbolTable; import org.apache.xerces.util.URI; import org.apache.xerces.util.XMLChar; @@ -2740,7 +2741,7 @@ } // scanLiteral(int,XMLString):int /** - * Scans a range of character data up to the specicied delimiter, + * Scans a range of character data up to the specified delimiter, * setting the fields of the XMLString structure, appropriately. * <p> * <strong>Note:</strong> The characters are consumed. @@ -2769,166 +2770,161 @@ * @throws IOException Thrown if i/o error occurs. * @throws EOFException Thrown on end of file. 
*/ - public boolean scanData(String delimiter, XMLString data) + public boolean scanData(String delimiter, XMLStringBuffer buffer) throws IOException { - if (DEBUG_BUFFER) { - System.out.print("(scanData: "); - print(); - System.out.println(); - } - - // load more characters, if needed + boolean done = false; int delimLen = delimiter.length(); char charAt0 = delimiter.charAt(0); - //int limit = fCurrentEntity.count - delimLen + 1; - - if (fCurrentEntity.position == fCurrentEntity.count) { - load(0, true); - } - else if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { - System.arraycopy(fCurrentEntity.ch, fCurrentEntity.position, - fCurrentEntity.ch, 0, fCurrentEntity.count - fCurrentEntity.position); - load(fCurrentEntity.count - fCurrentEntity.position, false); - fCurrentEntity.position = 0; - } - if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { - // something must be wrong with the input: e.g., file ends an unterminated comment - int length = fCurrentEntity.count - fCurrentEntity.position; - data.setValues(fCurrentEntity.ch, fCurrentEntity.position, length); - fCurrentEntity.columnNumber += fCurrentEntity.count; - fCurrentEntity.position = fCurrentEntity.count; - load(0,true); - return false; - } - - // normalize newlines int offset = fCurrentEntity.position; int c = fCurrentEntity.ch[offset]; int newlines = 0; boolean external = fCurrentEntity.isExternal(); - if (c == '\n' || (c == '\r' && external)) { + do { if (DEBUG_BUFFER) { - System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": "); + System.out.print("(scanData: "); print(); System.out.println(); } - do { - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (c == '\r' && external) { - newlines++; - fCurrentEntity.lineNumber++; - fCurrentEntity.columnNumber = 1; - /***/ - if (fCurrentEntity.position == fCurrentEntity.count) { - offset = 0; - fCurrentEntity.position = newlines; - if (load(newlines, false)) { - break; + + // load more characters, if needed + + 
if (fCurrentEntity.position == fCurrentEntity.count) { + load(0, true); + } + else if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { + System.arraycopy(fCurrentEntity.ch, fCurrentEntity.position, + fCurrentEntity.ch, 0, fCurrentEntity.count - fCurrentEntity.position); + load(fCurrentEntity.count - fCurrentEntity.position, false); + fCurrentEntity.position = 0; + } + if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) { + // something must be wrong with the input: e.g., file ends an unterminated comment + int length = fCurrentEntity.count - fCurrentEntity.position; + buffer.append (fCurrentEntity.ch, fCurrentEntity.position, length); + fCurrentEntity.columnNumber += fCurrentEntity.count; + fCurrentEntity.position = fCurrentEntity.count; + load(0,true); + return false; + } + + // normalize newlines + offset = fCurrentEntity.position; + c = fCurrentEntity.ch[offset]; + newlines = 0; + if (c == '\n' || (c == '\r' && external)) { + if (DEBUG_BUFFER) { + System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": "); + print(); + System.out.println(); + } + do { + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (c == '\r' && external) { + newlines++; + fCurrentEntity.lineNumber++; + fCurrentEntity.columnNumber = 1; + if (fCurrentEntity.position == fCurrentEntity.count) { + offset = 0; + fCurrentEntity.position = newlines; + if (load(newlines, false)) { + break; + } + } + if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') { + fCurrentEntity.position++; + offset++; + } + /*** NEWLINE NORMALIZATION ***/ + else { + newlines++; } } - /***/ - if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') { - fCurrentEntity.position++; - offset++; - } - /*** NEWLINE NORMALIZATION ***/ - else { + else if (c == '\n') { newlines++; - } - /***/ - } - else if (c == '\n') { - newlines++; - fCurrentEntity.lineNumber++; - fCurrentEntity.columnNumber = 1; - /***/ - if (fCurrentEntity.position == fCurrentEntity.count) { - offset = 0; - 
fCurrentEntity.position = newlines; - fCurrentEntity.count = newlines; - if (load(newlines, false)) { - break; + fCurrentEntity.lineNumber++; + fCurrentEntity.columnNumber = 1; + if (fCurrentEntity.position == fCurrentEntity.count) { + offset = 0; + fCurrentEntity.position = newlines; + fCurrentEntity.count = newlines; + if (load(newlines, false)) { + break; + } } } - /***/ - /*** NEWLINE NORMALIZATION *** - if (fCurrentEntity.ch[fCurrentEntity.position] == '\r' - && external) { - fCurrentEntity.position++; - offset++; + else { + fCurrentEntity.position--; + break; } - /***/ + } while (fCurrentEntity.position < fCurrentEntity.count - 1); + for (int i = offset; i < fCurrentEntity.position; i++) { + fCurrentEntity.ch[i] = '\n'; } - else { - fCurrentEntity.position--; - break; + int length = fCurrentEntity.position - offset; + if (fCurrentEntity.position == fCurrentEntity.count - 1) { + buffer.append(fCurrentEntity.ch, offset, length); + if (DEBUG_BUFFER) { + System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); + print(); + System.out.println(); + } + return true; } - } while (fCurrentEntity.position < fCurrentEntity.count - 1); - for (int i = offset; i < fCurrentEntity.position; i++) { - fCurrentEntity.ch[i] = '\n'; - } - int length = fCurrentEntity.position - offset; - if (fCurrentEntity.position == fCurrentEntity.count - 1) { - data.setValues(fCurrentEntity.ch, offset, length); if (DEBUG_BUFFER) { System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); print(); System.out.println(); } - return true; - } - if (DEBUG_BUFFER) { - System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": "); - print(); - System.out.println(); } - } - - // iterate over buffer looking for delimiter - boolean done = false; - OUTER: while (fCurrentEntity.position < fCurrentEntity.count) { - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (c == charAt0) { - // looks like we just hit the delimiter - int delimOffset = 
fCurrentEntity.position - 1; - for (int i = 1; i < delimLen; i++) { - if (fCurrentEntity.position == fCurrentEntity.count) { - fCurrentEntity.position -= i; - break OUTER; + + // iterate over buffer looking for delimiter + OUTER: while (fCurrentEntity.position < fCurrentEntity.count) { + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (c == charAt0) { + // looks like we just hit the delimiter + int delimOffset = fCurrentEntity.position - 1; + for (int i = 1; i < delimLen; i++) { + if (fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.position -= i; + break OUTER; + } + c = fCurrentEntity.ch[fCurrentEntity.position++]; + if (delimiter.charAt(i) != c) { + fCurrentEntity.position--; + break; + } } - c = fCurrentEntity.ch[fCurrentEntity.position++]; - if (delimiter.charAt(i) != c) { - fCurrentEntity.position--; + if (fCurrentEntity.position == delimOffset + delimLen) { + done = true; break; } } - if (fCurrentEntity.position == delimOffset + delimLen) { - done = true; + else if (c == '\n' || (external && c == '\r')) { + fCurrentEntity.position--; break; } + else if (XMLChar.isInvalid(c)) { + fCurrentEntity.position--; + int length = fCurrentEntity.position - offset; + fCurrentEntity.columnNumber += length - newlines; + buffer.append(fCurrentEntity.ch, offset, length); + return true; + } } - else if (c == '\n' || (external && c == '\r')) { - fCurrentEntity.position--; - break; + int length = fCurrentEntity.position - offset; + fCurrentEntity.columnNumber += length - newlines; + if (done) { + length -= delimLen; } - else if (XMLChar.isInvalid(c)) { - fCurrentEntity.position--; - break; + buffer.append (fCurrentEntity.ch, offset, length); + + // return true if string was skipped + if (DEBUG_BUFFER) { + System.out.print(")scanData: "); + print(); + System.out.println(" -> " + done); } - } - int length = fCurrentEntity.position - offset; - fCurrentEntity.columnNumber += length - newlines; - if (done) { - length -= delimLen; - } - 
data.setValues(fCurrentEntity.ch, offset, length); - - // return true if string was skipped - if (DEBUG_BUFFER) { - System.out.print(")scanData: "); - print(); - System.out.println(" -> " + done); - } + } while (!done); return !done; } // scanData(String,XMLString) 1.7 +2 -1 xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java Index: XMLEntityScanner.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- XMLEntityScanner.java 10 Sep 2002 14:20:07 -0000 1.6 +++ XMLEntityScanner.java 11 Sep 2002 20:22:35 -0000 1.7 @@ -63,6 +63,7 @@ import org.apache.xerces.xni.QName; import org.apache.xerces.xni.XMLLocator; import org.apache.xerces.xni.XMLString; +import org.apache.xerces.util.XMLStringBuffer; /** * This class allows various parser scanners to scan basic XML constructs @@ -288,7 +289,7 @@ * @throws IOException Thrown if i/o error occurs. * @throws EOFException Thrown on end of file. 
*/ - public abstract boolean scanData(String delimiter, XMLString data) + public abstract boolean scanData(String delimiter, XMLStringBuffer data) throws IOException; /** 1.19 +5 -4 xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java Index: XMLDocumentFragmentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java,v retrieving revision 1.18 retrieving revision 1.19 diff -u -r1.18 -r1.19 --- XMLDocumentFragmentScannerImpl.java 10 Sep 2002 14:20:07 -0000 1.18 +++ XMLDocumentFragmentScannerImpl.java 11 Sep 2002 20:22:35 -0000 1.19 @@ -898,9 +898,10 @@ } while (true) { - if (!fEntityScanner.scanData("]]", fString)) { - if (fDocumentHandler != null && fString.length > 0) { - fDocumentHandler.characters(fString, null); + fStringBuffer.clear(); + if (!fEntityScanner.scanData("]]", fStringBuffer)) { + if (fDocumentHandler != null && fStringBuffer.length > 0) { + fDocumentHandler.characters(fStringBuffer, null); } int brackets = 2; while (fEntityScanner.skipChar(']')) { @@ -924,7 +925,7 @@ } else { if (fDocumentHandler != null) { - fDocumentHandler.characters(fString, null); + fDocumentHandler.characters(fStringBuffer, null); } int c = fEntityScanner.peekChar(); if (c != -1 && XMLChar.isInvalid(c)) { 1.20 +4 -9 xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java Index: XMLScanner.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java,v retrieving revision 1.19 retrieving revision 1.20 diff -u -r1.19 -r1.20 --- XMLScanner.java 10 Sep 2002 14:20:07 -0000 1.19 +++ XMLScanner.java 11 Sep 2002 20:22:35 -0000 1.20 @@ -637,11 +637,10 @@ } } + fStringBuffer.clear(); // data - if (fEntityScanner.scanData("?>", data)) { - fStringBuffer.clear(); + if (fEntityScanner.scanData("?>", fStringBuffer)) { do { - 
fStringBuffer.append(data); int c = fEntityScanner.peekChar(); if (c != -1) { if (XMLChar.isHighSurrogate(c)) { @@ -653,8 +652,7 @@ fEntityScanner.scanChar(); } } - } while (fEntityScanner.scanData("?>", data)); - fStringBuffer.append(data); + } while (fEntityScanner.scanData("?>", fStringBuffer)); data.setValues(fStringBuffer); } @@ -679,9 +677,7 @@ // text // REVISIT: handle invalid character, eof text.clear(); - while (fEntityScanner.scanData("--", fString)) { - text.append(fString); - /***/ + while (fEntityScanner.scanData("--", text)) { int c = fEntityScanner.peekChar(); if (c != -1) { if (XMLChar.isHighSurrogate(c)) { @@ -694,7 +690,6 @@ } } } - text.append(fString); if (!fEntityScanner.skipChar('>')) { reportFatalError("DashDashInComment", null); }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]