io UTF8Reader.java

mrglavas Thu, 22 Jan 2004 13:42:43 -0800

mrglavas    2004/01/22 12:41:21

  Modified:    java/src/org/apache/xerces/impl/io UTF8Reader.java
  Log:
  Fixing Bug #24579:
  http://nagoya.apache.org/bugzilla/show_bug.cgi?id=24579
  
  XML 1.0 SE - E27. According to Unicode 3.1, conformant UTF-8
  interpreters must reject non-shortest form byte sequences.
  Some examples are C0 80 and E0 80 80, both corresponding
  to codepoint 0.
  
  Extra checks are required for all multi-byte sequences. Currently
  this is done by ANDing the bytes with a mask. We should revisit
  this code to check whether it would be faster to check the value
  of the character after the bytes have been combined.
  
  Revision  Changes    Path
  1.8       +14 -8     xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java
  
  Index: UTF8Reader.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- UTF8Reader.java   19 Aug 2003 19:06:14 -0000      1.7
  +++ UTF8Reader.java   22 Jan 2004 20:41:21 -0000      1.8
  @@ -2,7 +2,7 @@
    * The Apache Software License, Version 1.1
    *
    *
  - * Copyright (c) 2000-2002 The Apache Software Foundation.  All rights
  + * Copyright (c) 2000-2004 The Apache Software Foundation.  All rights
    * reserved.
    *
    * Redistribution and use in source and binary forms, with or without
  @@ -194,7 +194,7 @@
   
               // UTF-8:   [110y yyyy] [10xx xxxx]
               // Unicode: [0000 0yyy] [yyxx xxxx]
  -            else if ((b0 & 0xE0) == 0xC0) {
  +            else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                   int b1 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
  @@ -214,7 +214,9 @@
                   if (b1 == -1) {
                       expectedByte(2, 3);
                   }
  -                if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)) {
  +                if ((b1 & 0xC0) != 0x80 
  +                    || (b0 == 0xED && b1 >= 0xA0)
  +                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                       invalidByte(2, 3, b1);
                   }
                   int b2 = index == fOffset
  @@ -239,7 +241,8 @@
                   if (b1 == -1) {
                       expectedByte(2, 4);
                   }
  -                if ((b1 & 0xC0) != 0x80) {
  +                if ((b1 & 0xC0) != 0x80
  +                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                       invalidByte(2, 3, b1);
                   }
                   int b2 = index == fOffset
  @@ -370,7 +373,7 @@
               // UTF-8:   [110y yyyy] [10xx xxxx]
               // Unicode: [0000 0yyy] [yyxx xxxx]
               int b0 = byte1 & 0x0FF;
  -            if ((b0 & 0xE0) == 0xC0) {
  +            if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                   int b1 = -1;
                   if (++in < total) {
                       b1 = fBuffer[in] & 0x00FF;
  @@ -421,7 +424,9 @@
                       }
                       count++;
                   }
  -                if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)) {
  +                if ((b1 & 0xC0) != 0x80 
  +                    || (b0 == 0xED && b1 >= 0xA0)
  +                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                       if (out > offset) {
                           fBuffer[0] = (byte)b0;
                           fBuffer[1] = (byte)b1;
  @@ -485,7 +490,8 @@
                       }
                       count++;
                   }
  -                if ((b1 & 0xC0) != 0x80) {
  +                if ((b1 & 0xC0) != 0x80
  +                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                       if (out > offset) {
                           fBuffer[0] = (byte)b0;
                           fBuffer[1] = (byte)b1;


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io UTF8Reader.java

Reply via email to