costin 01/05/26 10:07:31 Added: src/share/org/apache/tomcat/util/buf UDecoder.java UEncoder.java Log: Added ( refactored ) UTF encoder and decoder. The code used to be part of Byte/Char Chunk, but had many bugs and it was hard to optimize. Note that we don't implement M$ encoding scheme ( which is not standard and may cause many problems ), but it could be implemented. There is still work to be done for decoding char[] - the result of the conversion is byte, and it has to be converted ( somehow ) to char, but you can't do that without a b->c converter. ( this will happen for RequestDispatchers for example - a workaround is to not encode "extended" chars ) Revision Changes Path 1.1 jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UDecoder.java Index: UDecoder.java =================================================================== /* * ==================================================================== * * The Apache Software License, Version 1.1 * * Copyright (c) 1999 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, if * any, must include the following acknowlegement: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowlegement may appear in the software itself, * if and wherever such third-party acknowlegements normally appear. * * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software * Foundation" must not be used to endorse or promote products derived * from this software without prior written permission. For written * permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache" * nor may "Apache" appear in their names without prior written * permission of the Apache Group. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. * * [Additional notices, if required by prior licensing conditions] * */ package org.apache.tomcat.util.buf; import org.apache.tomcat.util.buf.*; import java.util.BitSet; import java.io.*; /** * All URL decoding happens here. This way we can reuse, review, optimize * without adding complexity to the buffers. * * The conversion will modify the original buffer. * * @author Costin Manolache */ public final class UDecoder { public UDecoder() { } /** URLDecode, will modify the source. */ public void convert(ByteChunk mb) throws IOException { int start=mb.getOffset(); byte buff[]=mb.getBytes(); int end=mb.getEnd(); int idx= mb.indexOf( buff, start, end, '%' ); int idx2= mb.indexOf( buff, start, end, '+' ); if( idx<0 && idx2<0 ) { return; } if( idx2 >= 0 && idx2 < idx ) idx=idx2; for( int j=idx; j<end; j++, idx++ ) { if( buff[ j ] == '+' ) { buff[idx]= (byte)' ' ; } else if( buff[ j ] != '%' ) { buff[idx]= buff[j]; } else { // read next 2 digits if( j+2 >= end ) { throw new CharConversionException("EOF"); } byte b1= buff[j+1]; byte b2=buff[j+2]; if( !isHexDigit( b1 ) || ! isHexDigit(b2 )) throw new CharConversionException( "isHexDigit"); j+=2; int res=x2c( b1, b2 ); buff[idx]=(byte)res; } } mb.setEnd( idx ); return; } // -------------------- Additional methods -------------------- // XXX What do we do about charset ???? /** In-buffer processing - the buffer will be modified */ public void convert( CharChunk mb ) throws IOException { log( "Converting a char chunk "); int start=mb.getOffset(); char buff[]=mb.getBuffer(); int cend=mb.getEnd(); int idx= mb.indexOf( buff, start, cend, '%' ); int idx2= mb.indexOf( buff, start, cend, '+' ); if( idx<0 && idx2<0 ) { return; } if( idx2 >= 0 && idx2 < idx ) idx=idx2; for( int j=idx; j<cend; j++, idx++ ) { if( buff[ j ] == '+' ) { buff[idx]=( ' ' ); } else if( buff[ j ] != '%' ) { buff[idx]=buff[j]; } else { // read next 2 digits if( j+2 >= cend ) { // invalid throw new CharConversionException("EOF"); } char b1= buff[j+1]; char b2=buff[j+2]; if( !isHexDigit( b1 ) || ! isHexDigit(b2 )) throw new CharConversionException("isHexDigit"); j+=2; int res=x2c( b1, b2 ); buff[idx]=(char)res; } } mb.setEnd( idx ); } /** URLDecode, will modify the source */ public void convert(MessageBytes mb) throws IOException { switch (mb.getType()) { case MessageBytes.T_STR: String strValue=mb.toString(); if( strValue==null ) return; mb.setString( convert( strValue )); break; case MessageBytes.T_CHARS: CharChunk charC=mb.getCharChunk(); convert( charC ); break; case MessageBytes.T_BYTES: ByteChunk bytesC=mb.getByteChunk(); convert( bytesC ); break; } } // XXX Old code, needs to be replaced !!!! // public final String convert(String str) { if (str == null) return null; if( str.indexOf( '+' ) <0 && str.indexOf( '%' ) < 0 ) return str; StringBuffer dec = new StringBuffer(); // decoded string output int strPos = 0; int strLen = str.length(); dec.ensureCapacity(str.length()); while (strPos < strLen) { int laPos; // lookahead position // look ahead to next URLencoded metacharacter, if any for (laPos = strPos; laPos < strLen; laPos++) { char laChar = str.charAt(laPos); if ((laChar == '+') || (laChar == '%')) { break; } } // if there were non-metacharacters, copy them all as a block if (laPos > strPos) { dec.append(str.substring(strPos,laPos)); strPos = laPos; } // shortcut out of here if we're at the end of the string if (strPos >= strLen) { break; } // process next metacharacter char metaChar = str.charAt(strPos); if (metaChar == '+') { dec.append(' '); strPos++; continue; } else if (metaChar == '%') { // We throw the original exception - the super will deal with // it // try { dec.append((char)Integer. parseInt(str.substring(strPos + 1, strPos + 3),16)); strPos += 3; } } return dec.toString(); } private static boolean isHexDigit( int c ) { return ( ( c>='0' && c<='9' ) || ( c>='a' && c<='f' ) || ( c>='A' && c<='F' )); } private static int x2c( byte b1, byte b2 ) { int digit= (b1>='A') ? ( (b1 & 0xDF)-'A') + 10 : (b1 -'0'); digit*=16; digit +=(b2>='A') ? ( (b2 & 0xDF)-'A') + 10 : (b2 -'0'); return digit; } private static int x2c( char b1, char b2 ) { int digit= (b1>='A') ? ( (b1 & 0xDF)-'A') + 10 : (b1 -'0'); digit*=16; digit +=(b2>='A') ? ( (b2 & 0xDF)-'A') + 10 : (b2 -'0'); return digit; } private final static int debug=0; private static void log( String s ) { System.out.println("URLDecoder: " + s ); } } 1.1 jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UEncoder.java Index: UEncoder.java =================================================================== /* * ==================================================================== * * The Apache Software License, Version 1.1 * * Copyright (c) 1999 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, if * any, must include the following acknowlegement: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowlegement may appear in the software itself, * if and wherever such third-party acknowlegements normally appear. * * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software * Foundation" must not be used to endorse or promote products derived * from this software without prior written permission. For written * permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache" * nor may "Apache" appear in their names without prior written * permission of the Apache Group. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. * * [Additional notices, if required by prior licensing conditions] * */ package org.apache.tomcat.util.buf; import org.apache.tomcat.util.buf.*; import java.util.BitSet; import java.io.*; /** Efficient implementation for encoders. * This class is not thread safe - you need one encoder per thread. * The encoder will save and recycle the internal objects, avoiding * garbage. * * You can add extra characters that you want preserved, for example * while encoding a URL you can add "/". * * @author Costin Manolache */ public final class UEncoder { // Not static - the set may differ ( it's better than adding // an extra check for "/", "+", etc private BitSet safeChars=null; private C2BConverter c2b=null; private ByteChunk bb=null; private String encoding="UTF8"; private static final int debug=0; public UEncoder() { initSafeChars(); } public void setEncoding( String s ) { encoding=s; } public void addSafeCharacter( char c ) { safeChars.set( c ); } /** URL Encode string, using a specified encoding. * @param s string to be encoded * @param enc character encoding, for chars >%80 ( use UTF8 if not set, * as recommended in RFCs) * @param reserved extra characters to preserve ( "/" - if s is a URL ) */ public void urlEncode( Writer buf, String s ) throws IOException { if( c2b==null ) { bb=new ByteChunk(16); // small enough. c2b=new C2BConverter( bb, encoding ); } for (int i = 0; i < s.length(); i++) { int c = (int) s.charAt(i); if( safeChars.get( c ) ) { if( debug > 0 ) log("Safe: " + (char)c); buf.write((char)c); } else { if( debug > 0 ) log("Unsafe: " + (char)c); c2b.convert( (char)c ); // "surrogate" - UTF is _not_ 16 bit, but 21 !!!! // ( while UCS is 31 ). Amazing... if (c >= 0xD800 && c <= 0xDBFF) { if ( (i+1) < s.length()) { int d = (int) s.charAt(i+1); if (d >= 0xDC00 && d <= 0xDFFF) { if( debug > 0 ) log("Unsafe: " + c); c2b.convert( (char)d); i++; } } } c2b.flushBuffer(); urlEncode( buf, bb.getBuffer(), bb.getOffset(), bb.getLength() ); bb.recycle(); } } } /** */ public void urlEncode( Writer buf, byte bytes[], int off, int len) throws IOException { for( int j=off; j< len; j++ ) { buf.write( '%' ); char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16); if( debug > 0 ) log("Encode: " + ch); buf.write(ch); ch = Character.forDigit(bytes[j] & 0xF, 16); if( debug > 0 ) log("Encode: " + ch); buf.write(ch); } } // -------------------- Internal implementation -------------------- // private void init() { } private void initSafeChars() { safeChars=new BitSet(128); int i; for (i = 'a'; i <= 'z'; i++) { safeChars.set(i); } for (i = 'A'; i <= 'Z'; i++) { safeChars.set(i); } for (i = '0'; i <= '9'; i++) { safeChars.set(i); } //safe safeChars.set('$'); safeChars.set('-'); safeChars.set('_'); safeChars.set('.'); // Dangerous: someone may treat this as " " // RFC1738 does allow it, it's not reserved // safeChars.set('+'); //extra safeChars.set('!'); safeChars.set('*'); safeChars.set('\''); safeChars.set('('); safeChars.set(')'); safeChars.set(','); } private static void log( String s ) { System.out.println("Encoder: " + s ); } }