mrglavas 2004/07/18 12:57:42 Modified: java/src/org/apache/xerces/xinclude XIncludeHandler.java XIncludeTextReader.java XInclude11TextReader.java Log: JIRA Issue #992:
http://nagoya.apache.org/jira/browse/XERCESJ-992 The parse method of XIncludeTextReader was reading from the input stream one character at a time, accumulating all the characters in a buffer before reporting them down the pipeline. This would be a space hog for large text includes. It's more efficient to read and report multiple chunks. Applying the patch from Ankit Pasricha with some modifications and additions. Now reading chunks from the stream and making multiple callbacks to characters() instead of accumulating all the text in the buffer. Modified XIncludeTextReader so that it is reusable. Reusing XIncludeTextReader in XIncludeHandler. Also using input-buffer-size property to determine the size of the internal buffer used for processing text includes. Revision Changes Path 1.27 +88 -14 xml-xerces/java/src/org/apache/xerces/xinclude/XIncludeHandler.java Index: XIncludeHandler.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/xinclude/XIncludeHandler.java,v retrieving revision 1.26 retrieving revision 1.27 diff -u -r1.26 -r1.27 --- XIncludeHandler.java 15 Apr 2004 04:51:56 -0000 1.26 +++ XIncludeHandler.java 18 Jul 2004 19:57:42 -0000 1.27 @@ -84,6 +84,11 @@ * <li>http://apache.org/xml/properties/internal/error-reporter</li> * <li>http://apache.org/xml/properties/internal/entity-resolver</li> * </ul> + * Optional property: + * <ul> + * <li>http://apache.org/xml/properties/input-buffer-size</li> + * </ul> + * * Furthermore, the <code>NamespaceContext</code> used in the pipeline is required * to be an instance of <code>XIncludeNamespaceSupport</code>. * </p> @@ -171,6 +176,10 @@ /** property identifier: security manager. */ protected static final String SECURITY_MANAGER = Constants.XERCES_PROPERTY_PREFIX + Constants.SECURITY_MANAGER_PROPERTY; + + /** property identifier: buffer size. */ + public static final String BUFFER_SIZE = + Constants.XERCES_PROPERTY_PREFIX + Constants.BUFFER_SIZE_PROPERTY; /** Recognized features. */ private static final String[] RECOGNIZED_FEATURES = @@ -181,10 +190,10 @@ /** Recognized properties. */ private static final String[] RECOGNIZED_PROPERTIES = - { ERROR_REPORTER, ENTITY_RESOLVER, SECURITY_MANAGER }; + { ERROR_REPORTER, ENTITY_RESOLVER, SECURITY_MANAGER, BUFFER_SIZE }; /** Property defaults. */ - private static final Object[] PROPERTY_DEFAULTS = { null, null, null }; + private static final Object[] PROPERTY_DEFAULTS = { null, null, null, new Integer(XMLEntityManager.DEFAULT_BUFFER_SIZE) }; // instance variables @@ -198,8 +207,11 @@ // for XIncludeHandler protected XIncludeHandler fParentXIncludeHandler; + + // for buffer size in XIncludeTextReader + protected int fBufferSize = XMLEntityManager.DEFAULT_BUFFER_SIZE; - // It's "feels wrong" to store this value here. However, + // It "feels wrong" to store this value here. However, // calculating it can be time consuming, so we cache it. // It's never going to change in the lifetime of this XIncludeHandler protected String fParentRelativeURI; @@ -213,6 +225,10 @@ protected XMLErrorReporter fErrorReporter; protected XMLEntityResolver fEntityResolver; protected SecurityManager fSecurityManager; + + // these are needed for text include processing + protected XIncludeTextReader fXInclude10TextReader; + protected XIncludeTextReader fXInclude11TextReader; // these are needed for XML Base processing protected XMLResourceIdentifier fCurrentBaseURI; @@ -370,6 +386,32 @@ catch (XMLConfigurationException e) { fSecurityManager = null; } + + // Get buffer size. + try { + Integer value = + (Integer)componentManager.getProperty( + BUFFER_SIZE); + + if (value != null && value.intValue() > 0) { + fBufferSize = value.intValue(); + } + else { + fBufferSize = ((Integer)getPropertyDefault(BUFFER_SIZE)).intValue(); + } + } + catch (XMLConfigurationException e) { + fBufferSize = ((Integer)getPropertyDefault(BUFFER_SIZE)).intValue(); + } + + // Reset XML 1.0 text reader. + if (fXInclude10TextReader != null) { + fXInclude10TextReader.setBufferSize(fBufferSize); + } + // Reset XML 1.1 text reader. + if (fXInclude11TextReader != null) { + fXInclude11TextReader.setBufferSize(fBufferSize); + } fSettings = new ParserConfigurationSettings(); copyFeatures(componentManager, fSettings); @@ -442,18 +484,36 @@ if (fChildConfig != null) { fChildConfig.setProperty(propertyId, value); } + return; } if (propertyId.equals(ENTITY_RESOLVER)) { fEntityResolver = (XMLEntityResolver)value; if (fChildConfig != null) { fChildConfig.setProperty(propertyId, value); } + return; } if (propertyId.equals(SECURITY_MANAGER)) { fSecurityManager = (SecurityManager)value; if (fChildConfig != null) { fChildConfig.setProperty(propertyId, value); } + return; + } + if (propertyId.equals(BUFFER_SIZE)) { + Integer bufferSize = (Integer) value; + if (bufferSize != null && bufferSize.intValue() > 0) { + fBufferSize = bufferSize.intValue(); + // Reset XML 1.0 text reader. + if (fXInclude10TextReader != null) { + fXInclude10TextReader.setBufferSize(fBufferSize); + } + // Reset XML 1.1 text reader. + if (fXInclude11TextReader != null) { + fXInclude11TextReader.setBufferSize(fBufferSize); + } + } + return; } } // setProperty(String,Object) @@ -1253,21 +1313,34 @@ // we only care about encoding for parse="text" String encoding = attributes.getValue(XINCLUDE_ATTR_ENCODING); includedSource.setEncoding(encoding); - - XIncludeTextReader reader = null; + XIncludeTextReader textReader = null; + try { - if (fIsXML11) { - reader = new XInclude11TextReader(includedSource, this); + // Setup the appropriate text reader. + if (!fIsXML11) { + if (fXInclude10TextReader == null) { + fXInclude10TextReader = new XIncludeTextReader(includedSource, this, fBufferSize); + } + else { + fXInclude10TextReader.setInputSource(includedSource); + } + textReader = fXInclude10TextReader; } else { - reader = new XIncludeTextReader(includedSource, this); + if (fXInclude11TextReader == null) { + fXInclude11TextReader = new XInclude11TextReader(includedSource, this, fBufferSize); + } + else { + fXInclude11TextReader.setInputSource(includedSource); + } + textReader = fXInclude11TextReader; } if (includedSource.getCharacterStream() == null && includedSource.getByteStream() == null) { - reader.setHttpProperties(accept, acceptLanguage); + textReader.setHttpProperties(accept, acceptLanguage); } - reader.setErrorReporter(fErrorReporter); - reader.parse(); + textReader.setErrorReporter(fErrorReporter); + textReader.parse(); } // encoding errors catch (MalformedByteSequenceException ex) { @@ -1285,9 +1358,9 @@ return false; } finally { - if (reader != null) { + if (textReader != null) { try { - reader.close(); + textReader.close(); } catch (IOException e) { reportResourceError( @@ -2192,4 +2265,5 @@ } return true; } + } 1.11 +83 -50 xml-xerces/java/src/org/apache/xerces/xinclude/XIncludeTextReader.java Index: XIncludeTextReader.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/xinclude/XIncludeTextReader.java,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- XIncludeTextReader.java 15 Apr 2004 04:51:56 -0000 1.10 +++ XIncludeTextReader.java 18 Jul 2004 19:57:42 -0000 1.11 @@ -25,15 +25,15 @@ import java.net.URLConnection; import java.util.Locale; +import org.apache.xerces.impl.XMLEntityManager; +import org.apache.xerces.impl.XMLErrorReporter; import org.apache.xerces.impl.io.ASCIIReader; import org.apache.xerces.impl.io.UTF8Reader; import org.apache.xerces.impl.msg.XMLMessageFormatter; -import org.apache.xerces.impl.XMLEntityManager; -import org.apache.xerces.impl.XMLErrorReporter; import org.apache.xerces.util.EncodingMap; import org.apache.xerces.util.MessageFormatter; import org.apache.xerces.util.XMLChar; -import org.apache.xerces.util.XMLStringBuffer; +import org.apache.xerces.xni.XMLString; import org.apache.xerces.xni.parser.XMLInputSource; /** @@ -50,6 +50,7 @@ * * @author Michael Glavassevich, IBM * @author Peter McCracken, IBM + * @author Ankit Pasricha, IBM * @author Arun Yadav, Sun Microsystems Inc. * * @version $Id$ @@ -62,6 +63,7 @@ private XIncludeHandler fHandler; private XMLInputSource fSource; private XMLErrorReporter fErrorReporter; + private XMLString fTempString = new XMLString(); // Content negotation parameters private String fAccept; @@ -72,11 +74,13 @@ * * @param source The XMLInputSource to use. * @param handler The XIncludeHandler to use. + * @param bufferSize The size of this text reader's buffer. */ - public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler) + public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize) throws IOException { fHandler = handler; fSource = source; + fTempString = new XMLString(new char[bufferSize + 1], 0, 0); } /** @@ -121,7 +125,7 @@ stream = source.getByteStream(); // Wrap the InputStream so that it is possible to rewind it. if (!(stream instanceof BufferedInputStream)) { - stream = new BufferedInputStream(stream); + stream = new BufferedInputStream(stream, fTempString.ch.length); } } else { @@ -227,7 +231,7 @@ // this encoding has many aliases. if (encoding.equals("UTF-8")) { return new UTF8Reader(stream, - XMLEntityManager.DEFAULT_BUFFER_SIZE, + fTempString.ch.length, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() ); } @@ -248,7 +252,7 @@ } else if (javaEncoding.equals("ASCII")) { return new ASCIIReader(stream, - XMLEntityManager.DEFAULT_BUFFER_SIZE, + fTempString.ch.length, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() ); } @@ -405,56 +409,71 @@ * @throws IOException */ public void parse() throws IOException { - // REVISIT: This method needs to be rewritten to improve performance: both - // time and memory. We should be reading chunks and reporting chunks instead - // of reading characters individually and reporting all the characters in - // one callback. Also, currently we don't provide any locator information: - // line number, column number, etc... so if we report an error it will appear - // as if the invalid XML character was in the include parent. -- mrglavas - XMLStringBuffer buffer = new XMLStringBuffer(); + fReader = getReader(fSource); - int ch; - while((ch = fReader.read()) != -1) { - if (isValid(ch)) { - buffer.append((char)ch); - } - else if (XMLChar.isHighSurrogate(ch)) { - int ch2 = fReader.read(); - if (XMLChar.isLowSurrogate(ch2)) { - - // convert surrogates to a supplemental character - int sup = XMLChar.supplemental((char)ch, (char)ch2); - - // supplemental character must be a valid XML character - if (!isValid(sup)) { + fSource = null; + int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); + while (readSize != -1) { + for (int i = 0; i < readSize; ++i) { + char ch = fTempString.ch[i]; + if (!isValid(ch)) { + if (XMLChar.isHighSurrogate(ch)) { + int ch2; + // retrieve next character + if (++i < readSize) { + ch2 = fTempString.ch[i]; + } + // handle rare boundary case + else { + ch2 = fReader.read(); + if (ch2 != -1) { + fTempString.ch[readSize++] = (char) ch2; + } + } + if (XMLChar.isLowSurrogate(ch2)) { + // convert surrogates to a supplemental character + int sup = XMLChar.supplemental(ch, (char)ch2); + if (!isValid(sup)) { + fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, + "InvalidCharInContent", + new Object[] { Integer.toString(sup, 16) }, + XMLErrorReporter.SEVERITY_FATAL_ERROR); + } + } + else { + fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, + "InvalidCharInContent", + new Object[] { Integer.toString(ch2, 16) }, + XMLErrorReporter.SEVERITY_FATAL_ERROR); + } + } + else { fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, "InvalidCharInContent", - new Object[] { Integer.toString(sup, 16) }, + new Object[] { Integer.toString(ch, 16) }, XMLErrorReporter.SEVERITY_FATAL_ERROR); - continue; - } - buffer.append((char) ch); - buffer.append((char) ch2); - } - else { - fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, - "InvalidCharInContent", - new Object[] { Integer.toString(ch, 16) }, - XMLErrorReporter.SEVERITY_FATAL_ERROR); + } } } - else { - fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, - "InvalidCharInContent", - new Object[] { Integer.toString(ch, 16) }, - XMLErrorReporter.SEVERITY_FATAL_ERROR); + if (fHandler != null && readSize > 0) { + fTempString.offset = 0; + fTempString.length = readSize; + fHandler.characters( + fTempString, + fHandler.modifyAugmentations(null, true)); } + readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); } - if (fHandler != null && buffer.length > 0) { - fHandler.characters( - buffer, - fHandler.modifyAugmentations(null, true)); - } + + } + + /** + * Sets the input source on this text reader. + * + * @param source The XMLInputSource to use. + */ + public void setInputSource(XMLInputSource source) { + fSource = source; } /** @@ -466,6 +485,7 @@ public void close() throws IOException { if (fReader != null) { fReader.close(); + fReader = null; } } @@ -478,4 +498,17 @@ protected boolean isValid(int ch) { return XMLChar.isValid(ch); } + + /** + * Sets the buffer size property for the reader which decides the chunk sizes that are parsed + * by the reader at a time and passed to the handler + * + * @param bufferSize The size of the buffer desired + */ + protected void setBufferSize(int bufferSize) { + if (fTempString.ch.length != ++bufferSize) { + fTempString.ch = new char[bufferSize]; + } + } + } 1.3 +4 -3 xml-xerces/java/src/org/apache/xerces/xinclude/XInclude11TextReader.java Index: XInclude11TextReader.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/xinclude/XInclude11TextReader.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- XInclude11TextReader.java 24 Feb 2004 23:15:52 -0000 1.2 +++ XInclude11TextReader.java 18 Jul 2004 19:57:42 -0000 1.3 @@ -40,10 +40,11 @@ * * @param source The XMLInputSource to use. * @param handler The XIncludeHandler to use. + * @param bufferSize The size of this text reader's buffer. */ - public XInclude11TextReader(XMLInputSource source, XIncludeHandler handler) + public XInclude11TextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize) throws IOException { - super(source, handler); + super(source, handler, bufferSize); } /** --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]