serialize Encodings.java EncodingInfo.java OutputFormat.java SieveEncodingInfo.java

neilg Fri, 03 Jan 2003 08:13:39 -0800

neilg       2003/01/03 08:13:27

  Modified:    java/src/org/apache/xml/serialize Encodings.java
                        EncodingInfo.java OutputFormat.java
  Removed:     java/src/org/apache/xml/serialize SieveEncodingInfo.java
  Log:
  Rework handling of encodings within the serializer.  Now it should
  be able to support the intersection of the IANA encodings that
  Xerces recognizes and those for which the JDK has a CharToByte converter.
  To maximize portability of serialized documents, by default the serializer
  will only accept IANA names for encodings now.
  
  Revision  Changes    Path
  1.6       +68 -42    xml-xerces/java/src/org/apache/xml/serialize/Encodings.java
  
  Index: Encodings.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Encodings.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Encodings.java    29 Jan 2002 01:15:20 -0000      1.5
  +++ Encodings.java    3 Jan 2003 16:13:27 -0000       1.6
  @@ -63,6 +63,8 @@
   import java.io.OutputStream;
   import java.io.OutputStreamWriter;
   import java.io.UnsupportedEncodingException;
  +import java.util.Hashtable;
  +import org.apache.xerces.util.EncodingMap;
   
   
   /**
  @@ -81,19 +83,76 @@
       /**
        * The last printable character for unknown encodings.
        */
  -    static final int DefaultLastPrintable = 0x7F;
  +    static final int DEFAULT_LAST_PRINTABLE = 0x7F;
  +
  +    // last printable character for Unicode-compatible encodings
  +    static final int LAST_PRINTABLE_UNICODE = 0xffff;
  +    // unicode-compliant encodings; can express plane 0
  +    static final String[] UNICODE_ENCODINGS = {
  +        "Unicode", "UnicodeBig", "UnicodeLittle", "GB2312", "UTF8", 
  +    };
  +    // default (Java) encoding if none supplied:
  +    static final String DEFAULT_ENCODING = "UTF8";
  +
  +    // note that the size of this Hashtable
  +    // is bounded by the number of encodings recognized by EncodingMap;
  +    // therefore it poses no static mutability risk.
  +    static Hashtable _encodings = new Hashtable();
   
       /**
        * @param encoding a MIME charset name, or null.
        */
  -    static EncodingInfo getEncodingInfo(String encoding) {
  -        if (encoding == null)
  -            return new EncodingInfo(null, DefaultLastPrintable);
  -        for (int i = 0;  i < _encodings.length;  i++) {
  -            if (_encodings[i].name.equalsIgnoreCase(encoding))
  -                return _encodings[i];
  +    static EncodingInfo getEncodingInfo(String encoding, boolean allowJavaNames) throws UnsupportedEncodingException {
  +        EncodingInfo eInfo = null;
  +        if (encoding == null) {
  +            if((eInfo = (EncodingInfo)_encodings.get(DEFAULT_ENCODING)) != null) 
  +                return eInfo;
  +            eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(DEFAULT_ENCODING), DEFAULT_ENCODING, LAST_PRINTABLE_UNICODE);
  +            _encodings.put(DEFAULT_ENCODING, eInfo);
  +            return eInfo;
  +        }
  +        // need to convert it to upper case:
  +        encoding = encoding.toUpperCase();
  +        String jName = EncodingMap.getIANA2JavaMapping(encoding);
  +        if(jName == null) {
  +            // see if the encoding passed in is a Java encoding name.
  +            if(allowJavaNames ) {
  +                EncodingInfo.testJavaEncodingName(encoding);
  +                if((eInfo = (EncodingInfo)_encodings.get(encoding)) != null) 
  +                    return eInfo;
  +                // is it known to be unicode-compliant?
  +                int i=0;
  +                for(; i<UNICODE_ENCODINGS.length; i++) {
  +                    if(UNICODE_ENCODINGS[i].equalsIgnoreCase(encoding)) {
  +                        eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(encoding), encoding, LAST_PRINTABLE_UNICODE);
  +                        break;
  +                    }
  +                }
  +                if(i == UNICODE_ENCODINGS.length) {
  +                    eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(encoding), encoding, DEFAULT_LAST_PRINTABLE);
  +                }
  +                _encodings.put(encoding, eInfo); 
  +                return eInfo;
  +            } else {
  +                throw new UnsupportedEncodingException(encoding);
  +            }
  +        }
  +        if ((eInfo = (EncodingInfo)_encodings.get(jName)) != null)
  +            return eInfo;
  +        // have to create one...
  +        // is it known to be unicode-compliant?
  +        int i=0;
  +        for(; i<UNICODE_ENCODINGS.length; i++) {
  +            if(UNICODE_ENCODINGS[i].equalsIgnoreCase(jName)) {
  +                eInfo = new EncodingInfo(encoding, jName, LAST_PRINTABLE_UNICODE);
  +                break;
  +            }
           }
  -        return new SieveEncodingInfo(encoding, DefaultLastPrintable);
  +        if(i == UNICODE_ENCODINGS.length) {
  +            eInfo = new EncodingInfo(encoding, jName, DEFAULT_LAST_PRINTABLE);
  +        }
  +        _encodings.put(jName, eInfo); 
  +        return eInfo;
       }
   
       static final String JIS_DANGER_CHARS
  @@ -101,37 +160,4 @@
       +"\u2014\u2015\u2016\u2026\u203e\u203e\u2225\u222f\u301c"
       +"\uff3c\uff5e\uffe0\uffe1\uffe2\uffe3";
   
  -    /**
  -     * Constructs a list of all the supported encodings.
  -     */
  -    private static final EncodingInfo[] _encodings = new EncodingInfo[] {
  -        new EncodingInfo("ASCII", 0x7F),
  -        new EncodingInfo("US-ASCII", 0x7F),
  -        new EncodingInfo("ISO-8859-1", 0xFF),
  -        new EncodingInfo("ISO-8859-2", 0xFF),
  -        new EncodingInfo("ISO-8859-3", 0xFF),
  -        new EncodingInfo("ISO-8859-4", 0xFF),
  -        new EncodingInfo("ISO-8859-5", 0xFF),
  -        new EncodingInfo("ISO-8859-6", 0xFF),
  -        new EncodingInfo("ISO-8859-7", 0xFF),
  -        new EncodingInfo("ISO-8859-8", 0xFF),
  -        new EncodingInfo("ISO-8859-9", 0xFF),
  -        /**
  -         * Does JDK's converter supprt surrogates?
  -         * A Java encoding name "UTF-8" is suppoted by JDK 1.2 or later.
  -         */
  -        new EncodingInfo("UTF-8", "UTF8", 0x10FFFF),
  -        /**
  -         * JDK 1.1 supports "Shift_JIS" as an alias of "SJIS".
  -         * But JDK 1.2 treats "Shift_JIS" as an alias of "MS932".
  -         * The JDK 1.2's behavior is invalid against IANA registrations.
  -         */
  -        new SieveEncodingInfo("Shift_JIS", "SJIS", 0x7F, JIS_DANGER_CHARS),
  -        /**
  -         * "MS932" is supported by JDK 1.2 or later.
  -         */
  -        new SieveEncodingInfo("Windows-31J", "MS932", 0x7F, JIS_DANGER_CHARS),
  -        new SieveEncodingInfo("EUC-JP", null, 0x7F, JIS_DANGER_CHARS),
  -        new SieveEncodingInfo("ISO-2022-JP", null, 0x7F, JIS_DANGER_CHARS),
  -    };
   }
  
  
  
  1.3       +62 -20    xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java
  
  Index: EncodingInfo.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- EncodingInfo.java 29 Jan 2002 01:15:20 -0000      1.2
  +++ EncodingInfo.java 3 Jan 2003 16:13:27 -0000       1.3
  @@ -62,6 +62,8 @@
   import java.io.OutputStreamWriter;
   import java.io.UnsupportedEncodingException;
   import java.io.Writer;
  +import sun.io.CharToByteConverter;
  +import org.apache.xerces.util.EncodingMap;
   
   /**
    * This class represents an encoding.
  @@ -70,31 +72,33 @@
    */
   public class EncodingInfo {
   
  -    String name;
  +    // name of encoding as registered with IANA;
  +    // preferably a MIME name, but aliases are fine too.
  +    String ianaName;
       String javaName;
       int lastPrintable;
  +    // the charToByteConverter we test unusual characters
  +    // with
  +    CharToByteConverter fCToB = null;
  +    // is the converter null because it can't be instantiated
  +    // for some reason (perhaps we're running with insufficient authority as 
  +    // an applet?
  +    boolean fHaveTriedCToB = false;
   
       /**
        * Creates new <code>EncodingInfo</code> instance.
        */
  -    public EncodingInfo(String mimeName, String javaName, int lastPrintable) {
  -        this.name = mimeName;
  -        this.javaName = javaName == null ? mimeName : javaName;
  +    public EncodingInfo(String ianaName, String javaName, int lastPrintable) {
  +        this.ianaName = ianaName;
  +        this.javaName = EncodingMap.getIANA2JavaMapping(ianaName);
           this.lastPrintable = lastPrintable;
       }
   
       /**
  -     * Creates new <code>EncodingInfo</code> instance.
  -     */
  -    public EncodingInfo(String mimeName, int lastPrintable) {
  -        this(mimeName, mimeName, lastPrintable);
  -    }
  -
  -    /**
        * Returns a MIME charset name of this encoding.
        */
  -    public String getName() {
  -        return this.name;
  +    public String getIANAName() {
  +        return this.ianaName;
       }
   
       /**
  @@ -107,16 +111,54 @@
        */
       public Writer getWriter(OutputStream output)
           throws UnsupportedEncodingException {
  -        if (this.javaName == null)
  -            return new OutputStreamWriter(output);
  -        return new OutputStreamWriter(output, this.javaName);
  +        // this should always be true!
  +        if (javaName != null) 
  +            return new OutputStreamWriter(output, javaName);
  +        javaName = EncodingMap.getIANA2JavaMapping(ianaName);
  +        if(javaName == null) 
  +            // use UTF-8 as preferred encoding
  +            return new OutputStreamWriter(output, "UTF8");
  +        return new OutputStreamWriter(output, javaName);
       }
       /**
  -     * Checks whether the specified character is printable or not.
  +     * Checks whether the specified character is printable or not
  +     * in this encoding.
        *
        * @param ch a code point (0-0x10ffff)
        */
  -    public boolean isPrintable(int ch) {
  -        return ch <= this.lastPrintable;
  +    public boolean isPrintable(char ch) {
  +        if(ch <= this.lastPrintable) 
  +            return true;
  +        
  +        if(fCToB == null) {
  +            if(fHaveTriedCToB) {
  +                // forget it; nothing we can do...
  +                return false;
  +            }
  +            // try and create it:
  +            try {
  +                fCToB = CharToByteConverter.getConverter(javaName);
  +            } catch(Exception e) {   
  +                // don't try it again...
  +                fHaveTriedCToB = true;
  +                return false;
  +            }
  +        }
  +        try {
  +            return fCToB.canConvert(ch); 
  +        } catch (Exception e) {
  +            // obviously can't use this converter; probably some kind of
  +            // security restriction
  +            fCToB = null;
  +            fHaveTriedCToB = false;
  +            return false;
  +        }
  +    }
  +
  +    // is this an encoding name recognized by this JDK?
  +    // if not, will throw UnsupportedEncodingException
  +    public static void testJavaEncodingName(String name)  throws UnsupportedEncodingException {
  +        final byte [] bTest = {'v', 'a', 'l', 'i', 'd'};
  +        String s = new String(bTest, name);
       }
   }
  
  
  
  1.18      +22 -4     xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java
  
  Index: OutputFormat.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -r1.17 -r1.18
  --- OutputFormat.java 29 Jan 2002 14:39:39 -0000      1.17
  +++ OutputFormat.java 3 Jan 2003 16:13:27 -0000       1.18
  @@ -68,6 +68,7 @@
   
   
   import java.util.Hashtable;
  +import java.io.UnsupportedEncodingException;
   
   import org.w3c.dom.Document;
   import org.w3c.dom.DocumentType;
  @@ -189,6 +190,9 @@
        */
       private EncodingInfo _encodingInfo = null;
   
  +    // whether java names for encodings are permitted
  +    private boolean _allowJavaNames = false;
  +
       /**
        * The specified media type or null.
        */
  @@ -491,7 +495,7 @@
        * instance.
        */
       public void setEncoding(EncodingInfo encInfo) {
  -        _encoding = encInfo.getName();
  +        _encoding = encInfo.getIANAName();
           _encodingInfo = encInfo;
       }
   
  @@ -500,10 +504,24 @@
        *
        * @see #setEncoding
        */
  -    public EncodingInfo getEncodingInfo() {
  +    public EncodingInfo getEncodingInfo() throws UnsupportedEncodingException {
           if (_encodingInfo == null)
  -            _encodingInfo = Encodings.getEncodingInfo(_encoding);
  +            _encodingInfo = Encodings.getEncodingInfo(_encoding, _allowJavaNames);
           return _encodingInfo;
  +    }
  +
  +    /**
  +     * Sets whether java encoding names are permitted
  +     */
  +    public void setAllowJavaNames (boolean allow) {
  +        _allowJavaNames = allow;
  +    }
  +
  +    /**
  +     * Returns whether java encoding names are permitted
  +     */
  +    public boolean setAllowJavaNames () {
  +        return _allowJavaNames;
       }
   
       /**


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/java/src/org/apache/xml/serialize Encodings.java EncodingInfo.java OutputFormat.java SieveEncodingInfo.java

Reply via email to