sandygao 2003/06/17 10:57:14 Modified: java/src/org/apache/xerces/impl/io UTF8Reader.java Log: A performance improvement for UTF8 scanning. Revision Changes Path 1.6 +52 -39 xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java Index: UTF8Reader.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- UTF8Reader.java 23 Jul 2002 20:53:19 -0000 1.5 +++ UTF8Reader.java 17 Jun 2003 17:57:14 -0000 1.6 @@ -113,8 +113,8 @@ // Constructors // - /** - * Constructs a UTF-8 reader from the specified input stream + /** + * Constructs a UTF-8 reader from the specified input stream * using the default buffer size. Primarily for testing. * * @param inputStream The input stream. @@ -123,8 +123,8 @@ this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault()); } // <init>(InputStream, MessageFormatter) - /** - * Constructs a UTF-8 reader from the specified input stream + /** + * Constructs a UTF-8 reader from the specified input stream * using the default buffer size and the given MessageFormatter. * * @param inputStream The input stream. @@ -136,8 +136,8 @@ this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale); } // <init>(InputStream, MessageFormatter) - /** - * Constructs a UTF-8 reader from the specified input stream, + /** + * Constructs a UTF-8 reader from the specified input stream, * buffer size and MessageFormatter. * * @param inputStream The input stream. @@ -180,7 +180,7 @@ int index = 0; // get first byte - int b0 = index == fOffset + int b0 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b0 == -1) { return -1; @@ -195,7 +195,7 @@ // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] else if ((b0 & 0xE0) == 0xC0) { - int b1 = index == fOffset + int b1 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { expectedByte(2, 2); @@ -217,7 +217,7 @@ if ((b1 & 0xC0) != 0x80) { invalidByte(2, 3, b1); } - int b2 = index == fOffset + int b2 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b2 == -1) { expectedByte(3, 3); @@ -234,7 +234,7 @@ // [1101 11yy] [yyxx xxxx] (low surrogate) // * uuuuu = wwww + 1 else if ((b0 & 0xF8) == 0xF0) { - int b1 = index == fOffset + int b1 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { expectedByte(2, 4); @@ -242,7 +242,7 @@ if ((b1 & 0xC0) != 0x80) { invalidByte(2, 3, b1); } - int b2 = index == fOffset + int b2 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b2 == -1) { expectedByte(3, 4); @@ -250,7 +250,7 @@ if ((b2 & 0xC0) != 0x80) { invalidByte(3, 3, b2); } - int b3 = index == fOffset + int b3 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b3 == -1) { expectedByte(4, 4); @@ -263,8 +263,8 @@ invalidSurrogate(uuuuu); } int wwww = uuuuu - 1; - int hs = 0xD800 | - ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) | + int hs = 0xD800 | + ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003); int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F); c = hs; @@ -334,7 +334,7 @@ // skip read; last character was in error // NOTE: Having an offset value other than zero means that there was // an error in the last character read. In this case, we have - // skipped the read so we don't consume any bytes past the + // skipped the read so we don't consume any bytes past the // error. By signalling the error on the next block read we // allow the method to return the most valid characters that // it can on the previous block read. -Ac @@ -345,22 +345,35 @@ // convert bytes to characters final int total = count; - for (int in = 0; in < total; in++) { - int b0 = fBuffer[in] & 0x00FF; + int in; + byte byte1; + final byte byte0 = 0; + for (in = 0; in < total; in++) { + byte1 = fBuffer[in]; + if (byte1 >= byte0) { + ch[out++] = (char)byte1; + } + else { + break; + } + } + for ( ; in < total; in++) { + byte1 = fBuffer[in]; // UTF-8: [0xxx xxxx] // Unicode: [0000 0000] [0xxx xxxx] - if (b0 < 0x80) { - ch[out++] = (char)b0; + if (byte1 >= byte0) { + ch[out++] = (char)byte1; continue; } // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] + int b0 = byte1 & 0x0FF; if ((b0 & 0xE0) == 0xC0) { int b1 = -1; - if (++in < total) { - b1 = fBuffer[in] & 0x00FF; + if (++in < total) { + b1 = fBuffer[in] & 0x00FF; } else { b1 = fInputStream.read(); @@ -393,8 +406,8 @@ // Unicode: [zzzz yyyy] [yyxx xxxx] if ((b0 & 0xF0) == 0xE0) { int b1 = -1; - if (++in < total) { - b1 = fBuffer[in] & 0x00FF; + if (++in < total) { + b1 = fBuffer[in] & 0x00FF; } else { b1 = fInputStream.read(); @@ -418,8 +431,8 @@ invalidByte(2, 3, b1); } int b2 = -1; - if (++in < total) { - b2 = fBuffer[in] & 0x00FF; + if (++in < total) { + b2 = fBuffer[in] & 0x00FF; } else { b2 = fInputStream.read(); @@ -457,8 +470,8 @@ // * uuuuu = wwww + 1 if ((b0 & 0xF8) == 0xF0) { int b1 = -1; - if (++in < total) { - b1 = fBuffer[in] & 0x00FF; + if (++in < total) { + b1 = fBuffer[in] & 0x00FF; } else { b1 = fInputStream.read(); @@ -482,8 +495,8 @@ invalidByte(2, 4, b1); } int b2 = -1; - if (++in < total) { - b2 = fBuffer[in] & 0x00FF; + if (++in < total) { + b2 = fBuffer[in] & 0x00FF; } else { b2 = fInputStream.read(); @@ -509,8 +522,8 @@ invalidByte(3, 4, b2); } int b3 = -1; - if (++in < total) { - b3 = fBuffer[in] & 0x00FF; + if (++in < total) { + b3 = fBuffer[in] & 0x00FF; } else { b3 = fInputStream.read(); @@ -614,14 +627,14 @@ * @exception IOException If an I/O error occurs */ public boolean ready() throws IOException { - return false; + return false; } // ready() /** * Tell whether this stream supports the mark() operation. */ public boolean markSupported() { - return false; + return false; } // markSupported() /** @@ -638,7 +651,7 @@ * or if some other I/O error occurs */ public void mark(int readAheadLimit) throws IOException { - throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"})); + throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"})); } // mark(int) /** @@ -685,10 +698,10 @@ } // expectedByte(int,int,int) /** Throws an exception for invalid byte. */ - private void invalidByte(int position, int count, int c) + private void invalidByte(int position, int count, int c) throws UTFDataFormatException { - String message = fFormatter.formatMessage(fLocale, "InvalidByte", + String message = fFormatter.formatMessage(fLocale, "InvalidByte", new Object [] {Integer.toString(position), Integer.toString(count)}); throw new UTFDataFormatException(message); @@ -696,11 +709,11 @@ /** Throws an exception for invalid surrogate bits. */ private void invalidSurrogate(int uuuuu) throws UTFDataFormatException { - + StringBuffer str = new StringBuffer(); str.append("high surrogate bits in UTF-8 sequence must not exceed 0x10 but found 0x"); - String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate", + String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate", new Object[] {Integer.toHexString(uuuuu)}); throw new UTFDataFormatException(message);
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]