costin      01/05/26 10:07:31

  Added:       src/share/org/apache/tomcat/util/buf UDecoder.java
                        UEncoder.java
  Log:
  Added ( refactored ) UTF encoder and decoder.
  
  The code used to be part of ByteChunk/CharChunk, but it had many bugs and
  was hard to optimize.
  
  Note that we don't implement M$ encoding scheme ( which is not standard and
  may cause many problems ), but it could be implemented.
  
  There is still work to be done for decoding char[]: the result of the
  conversion is a byte, which has to be converted ( somehow ) to a char, and
  that can't be done without a byte-to-char ( b->c ) converter.
  
  ( this will happen for RequestDispatchers for example - a workaround is to
  not encode "extended" chars )
  
  Revision  Changes    Path
  1.1                  
jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UDecoder.java
  
  Index: UDecoder.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  package org.apache.tomcat.util.buf;
  
  import org.apache.tomcat.util.buf.*;
  
  import java.util.BitSet;
  import java.io.*;
  
  /**
   *  All URL decoding happens here. This way we can reuse, review, optimize
   *  without adding complexity to the buffers.
   *
   *  <p>Decoding maps '+' to a space and every "%xx" escape ( two hex
   *  digits ) to the single byte / char holding that value. No charset
   *  conversion is applied to the decoded values.
   *
   *  <p>The chunk based conversions modify the original buffer in place.
   *
   *  @author Costin Manolache
   */
  public final class UDecoder {

      /** Debug level; the diagnostic message is printed only when > 0. */
      private final static int debug=0;

      public UDecoder()
      {
      }

      /** URLDecode, will modify the source.
       *
       *  The decoded data is never longer than the encoded data, so it is
       *  written back over the same array and the chunk end is moved.
       *
       *  @param mb chunk to decode in place
       *  @throws CharConversionException if a '%' is not followed by two
       *          hex digits inside the chunk
       */
      public void convert(ByteChunk mb)
          throws IOException
      {
          int start=mb.getOffset();
          byte buff[]=mb.getBytes();
          int end=mb.getEnd();

          int idx= mb.indexOf( buff, start, end, '%' );
          int idx2= mb.indexOf( buff, start, end, '+' );
          if( idx<0 && idx2<0 ) {
              // nothing to decode
              return;
          }

          // Start at the first meta character, whichever it is. When there
          // is no '%' at all idx is -1 and the '+' position must be used,
          // otherwise the loop below would start at index -1.
          if( idx < 0 || ( idx2 >= 0 && idx2 < idx ) ) idx=idx2;

          // j is the read position, idx the write position ( idx <= j )
          for( int j=idx; j<end; j++, idx++ ) {
              if( buff[ j ] == '+' ) {
                  buff[idx]= (byte)' ' ;
              } else if( buff[ j ] != '%' ) {
                  buff[idx]= buff[j];
              } else {
                  // read next 2 digits
                  if( j+2 >= end ) {
                      throw new CharConversionException("EOF");
                  }
                  byte b1= buff[j+1];
                  byte b2= buff[j+2];
                  if( !isHexDigit( b1 ) || !isHexDigit( b2 ) )
                      throw new CharConversionException( "isHexDigit");

                  j+=2;
                  buff[idx]=(byte)x2c( b1, b2 );
              }
          }

          mb.setEnd( idx );
      }

      // -------------------- Additional methods --------------------
      // XXX What do we do about charset ????

      /** In-buffer processing - the buffer will be modified.
       *
       *  Each "%xx" escape becomes one char with the value xx; no byte to
       *  char conversion is done.
       *
       *  @param mb chunk to decode in place
       *  @throws CharConversionException if a '%' is not followed by two
       *          hex digits inside the chunk
       */
      public void convert( CharChunk mb )
          throws IOException
      {
          if( debug > 0 ) log( "Converting a char chunk ");
          int start=mb.getOffset();
          char buff[]=mb.getBuffer();
          int cend=mb.getEnd();

          int idx= mb.indexOf( buff, start, cend, '%' );
          int idx2= mb.indexOf( buff, start, cend, '+' );
          if( idx<0 && idx2<0 ) {
              // nothing to decode
              return;
          }

          // Start at the first meta character; see convert(ByteChunk) for
          // why the idx<0 case has to be handled explicitly.
          if( idx < 0 || ( idx2 >= 0 && idx2 < idx ) ) idx=idx2;

          // j is the read position, idx the write position ( idx <= j )
          for( int j=idx; j<cend; j++, idx++ ) {
              if( buff[ j ] == '+' ) {
                  buff[idx]=( ' ' );
              } else if( buff[ j ] != '%' ) {
                  buff[idx]=buff[j];
              } else {
                  // read next 2 digits
                  if( j+2 >= cend ) {
                      // invalid
                      throw new CharConversionException("EOF");
                  }
                  char b1= buff[j+1];
                  char b2= buff[j+2];
                  if( !isHexDigit( b1 ) || !isHexDigit( b2 ) )
                      throw new CharConversionException("isHexDigit");

                  j+=2;
                  buff[idx]=(char)x2c( b1, b2 );
              }
          }
          mb.setEnd( idx );
      }

      /** URLDecode, will modify the source.
       *
       *  Dispatches on the current type of the MessageBytes ( string, chars
       *  or bytes ); any other type is left untouched.
       *
       *  @param mb value to decode
       *  @throws IOException if the char or byte chunk holds an invalid
       *          escape
       */
      public void convert(MessageBytes mb)
          throws IOException
      {
          switch (mb.getType()) {
          case MessageBytes.T_STR:
              String strValue=mb.toString();
              if( strValue==null ) return;
              mb.setString( convert( strValue ));
              break;
          case MessageBytes.T_CHARS:
              CharChunk charC=mb.getCharChunk();
              convert( charC );
              break;
          case MessageBytes.T_BYTES:
              ByteChunk bytesC=mb.getByteChunk();
              convert( bytesC );
              break;
          }
      }

      /** URLDecode a String.
       *
       *  @param str encoded string, may be null
       *  @return the decoded string; str itself when it is null or contains
       *          neither '+' nor '%'
       *  @throws NumberFormatException if a '%' is followed by two chars
       *          that are not both hex digits
       *  @throws StringIndexOutOfBoundsException if a '%' is followed by
       *          fewer than two chars
       */
      public final String convert(String str)
      {
          if (str == null)  return  null;

          if( str.indexOf( '+' ) <0 && str.indexOf( '%' ) < 0 )
              return str;

          int strLen = str.length();
          StringBuffer dec = new StringBuffer(strLen);  // decoded output
          int strPos = 0;

          while (strPos < strLen) {
              char c = str.charAt(strPos);
              if (c == '+') {
                  dec.append(' ');
                  strPos++;
              } else if (c == '%') {
                  // substring() raises StringIndexOutOfBoundsException on a
                  // truncated escape - the caller deals with it
                  String hex = str.substring(strPos + 1, strPos + 3);
                  char h1 = hex.charAt(0);
                  char h2 = hex.charAt(1);
                  // Integer.parseInt would also accept a sign or non-ASCII
                  // digits here ( "%-1", "%+f" ), so validate explicitly.
                  if( !isHexDigit( h1 ) || !isHexDigit( h2 ) )
                      throw new NumberFormatException(
                          "For input string: \"" + hex + "\"");
                  dec.append((char)x2c( h1, h2 ));
                  strPos += 3;
              } else {
                  dec.append(c);
                  strPos++;
              }
          }

          return dec.toString();
      }

      /** True if c is one of 0-9, a-f, A-F. */
      private static boolean isHexDigit( int c ) {
          return ( ( c>='0' && c<='9' ) ||
                   ( c>='a' && c<='f' ) ||
                   ( c>='A' && c<='F' ));
      }

      /** Value ( 0..15 ) of a char already validated with isHexDigit. */
      private static int hexValue( int c ) {
          // clearing bit 0x20 folds 'a'..'f' onto 'A'..'F'
          return (c>='A') ? ( (c & 0xDF)-'A') + 10 : (c -'0');
      }

      /** Combine two validated hex digits into the value they encode. */
      private static int x2c( int b1, int b2 ) {
          return hexValue( b1 )*16 + hexValue( b2 );
      }

      private static void log( String s ) {
          System.out.println("URLDecoder: " + s );
      }

  }
  
  
  
  1.1                  
jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UEncoder.java
  
  Index: UEncoder.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  
  package org.apache.tomcat.util.buf;
  
  import org.apache.tomcat.util.buf.*;
  import java.util.BitSet;
  import java.io.*;
  
  /** Efficient implementation for encoders.
   *  This class is not thread safe - you need one encoder per thread.
   *  The encoder will save and recycle the internal objects, avoiding
   *  garbage.
   *
   *  You can add extra characters that you want preserved, for example
   *  while encoding a URL you can add "/".
   *
   *  @author Costin Manolache
   */
  public final class UEncoder {

      // Not static - the set may differ ( it's better than adding
      // an extra check for "/", "+", etc
      private BitSet safeChars=null;
      // Created on first use, then reused for every encoded string
      private C2BConverter c2b=null;
      private ByteChunk bb=null;

      private String encoding="UTF8";
      private static final int debug=0;

      public UEncoder() {
          initSafeChars();
      }

      /** Set the character encoding used to turn unsafe chars into bytes
       *  before they are %-escaped ( default "UTF8" ).
       *
       *  @param s encoding name, handed to the C2BConverter
       */
      public void setEncoding( String s ) {
          boolean changed= ( encoding==null ) ? s!=null : !encoding.equals( s );
          encoding=s;
          if( changed ) {
              // The converter is bound to the encoding it was created with;
              // drop it so the next urlEncode() builds one for the new value.
              c2b=null;
              bb=null;
          }
      }

      /** Mark a character as safe: it will be written as-is instead of
       *  being %-escaped.
       *
       *  @param c character to preserve
       */
      public void addSafeCharacter( char c ) {
          safeChars.set( c );
      }


      /** URL Encode string, using the encoding set on this encoder.
       *
       *  Safe characters are written unchanged. Any other character is
       *  converted to bytes and each byte is written as "%xx". A high
       *  surrogate followed by a low surrogate is converted as one pair.
       *
       *  @param buf destination for the encoded output
       *  @param s string to be encoded
       *  @throws IOException if converting or writing fails
       */
      public void urlEncode( Writer buf, String s )
          throws IOException
      {
          if( c2b==null ) {
              bb=new ByteChunk(16); // small enough.
              c2b=new C2BConverter( bb, encoding );
          }

          for (int i = 0; i < s.length(); i++) {
              int c = (int) s.charAt(i);
              if( safeChars.get( c ) ) {
                  if( debug > 0 ) log("Safe: " + (char)c);
                  buf.write((char)c);
              } else {
                  if( debug > 0 ) log("Unsafe:  " + (char)c);
                  c2b.convert( (char)c );

                  // "surrogate" - UTF is _not_ 16 bit, but 21 !!!!
                  // ( while UCS is 31 ). Amazing...
                  if (c >= 0xD800 && c <= 0xDBFF) {
                      if ( (i+1) < s.length()) {
                          int d = (int) s.charAt(i+1);
                          if (d >= 0xDC00 && d <= 0xDFFF) {
                              if( debug > 0 ) log("Unsafe:  " + c);
                              // feed the low surrogate too and skip it
                              c2b.convert( (char)d);
                              i++;
                          }
                      }
                  }

                  c2b.flushBuffer();

                  urlEncode( buf, bb.getBuffer(), bb.getOffset(),
                             bb.getLength() );
                  bb.recycle();
              }
          }
      }

      /** Write every byte of a range as a "%xx" escape ( lower case hex ).
       *
       *  @param buf destination for the encoded output
       *  @param bytes source array
       *  @param off index of the first byte to encode
       *  @param len number of bytes to encode
       *  @throws IOException if writing fails
       */
      public void urlEncode( Writer buf, byte bytes[], int off, int len)
          throws IOException
      {
          // len is a length, not an end index
          int end=off+len;
          for( int j=off; j< end; j++ ) {
              buf.write( '%' );
              char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16);
              if( debug > 0 ) log("Encode:  " + ch);
              buf.write(ch);
              ch = Character.forDigit(bytes[j] & 0xF, 16);
              if( debug > 0 ) log("Encode:  " + ch);
              buf.write(ch);
          }
      }



      // -------------------- Internal implementation --------------------

      /** Build the default set of characters that are never escaped:
       *  ASCII letters, digits and a few punctuation marks.
       */
      private void initSafeChars() {
          safeChars=new BitSet(128);
          int i;
          for (i = 'a'; i <= 'z'; i++) {
              safeChars.set(i);
          }
          for (i = 'A'; i <= 'Z'; i++) {
              safeChars.set(i);
          }
          for (i = '0'; i <= '9'; i++) {
              safeChars.set(i);
          }
          //safe
          safeChars.set('$');
          safeChars.set('-');
          safeChars.set('_');
          safeChars.set('.');

          // Dangerous: someone may treat this as " "
          // RFC1738 does allow it, it's not reserved
          //    safeChars.set('+');
          //extra
          safeChars.set('!');
          safeChars.set('*');
          safeChars.set('\'');
          safeChars.set('(');
          safeChars.set(')');
          safeChars.set(',');
      }

      private static void log( String s ) {
          System.out.println("Encoder: " + s );
      }
  }
  
  
  

Reply via email to