mrglavas 2004/01/22 12:41:21 Modified: java/src/org/apache/xerces/impl/io UTF8Reader.java Log: Fixing Bug #24579: http://nagoya.apache.org/bugzilla/show_bug.cgi?id=24579 XML 1.0 SE - E27. According to Unicode 3.1, conformant UTF-8 interpreters must reject non-shortest form byte sequences. Some examples are C0 80 and E0 80 80, both corresponding to codepoint 0. Extra checks are required for all multi-byte sequences. Currently this is done by ANDing the bytes with a mask. We should revisit this code to check whether it would be faster to check the value of the character after the bytes have been combined. Revision Changes Path 1.8 +14 -8 xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java Index: UTF8Reader.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- UTF8Reader.java 19 Aug 2003 19:06:14 -0000 1.7 +++ UTF8Reader.java 22 Jan 2004 20:41:21 -0000 1.8 @@ -2,7 +2,7 @@ * The Apache Software License, Version 1.1 * * - * Copyright (c) 2000-2002 The Apache Software Foundation. All rights + * Copyright (c) 2000-2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without @@ -194,7 +194,7 @@ // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] - else if ((b0 & 0xE0) == 0xC0) { + else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) { int b1 = index == fOffset ? 
fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { @@ -214,7 +214,9 @@ if (b1 == -1) { expectedByte(2, 3); } - if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)) { + if ((b1 & 0xC0) != 0x80 + || (b0 == 0xED && b1 >= 0xA0) + || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) { invalidByte(2, 3, b1); } int b2 = index == fOffset @@ -239,7 +241,8 @@ if (b1 == -1) { expectedByte(2, 4); } - if ((b1 & 0xC0) != 0x80) { + if ((b1 & 0xC0) != 0x80 + || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) { invalidByte(2, 3, b1); } int b2 = index == fOffset @@ -370,7 +373,7 @@ // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] int b0 = byte1 & 0x0FF; - if ((b0 & 0xE0) == 0xC0) { + if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) { int b1 = -1; if (++in < total) { b1 = fBuffer[in] & 0x00FF; @@ -421,7 +424,9 @@ } count++; } - if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)) { + if ((b1 & 0xC0) != 0x80 + || (b0 == 0xED && b1 >= 0xA0) + || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) { if (out > offset) { fBuffer[0] = (byte)b0; fBuffer[1] = (byte)b1; @@ -485,7 +490,8 @@ } count++; } - if ((b1 & 0xC0) != 0x80) { + if ((b1 & 0xC0) != 0x80 + || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) { if (out > offset) { fBuffer[0] = (byte)b0; fBuffer[1] = (byte)b1;
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]