neilg 2003/01/03 08:13:27
Modified: java/src/org/apache/xml/serialize Encodings.java
EncodingInfo.java OutputFormat.java
Removed: java/src/org/apache/xml/serialize SieveEncodingInfo.java
Log:
Rework handling of encodings within the serializer. Now it should
be able to support the intersection of the IANA encodings that
Xerces recognizes and those for which the JDK has a CharToByte converter.
To maximize portability of serialized documents, by default the serializer
will only accept IANA names for encodings now.
Revision Changes Path
1.6 +68 -42 xml-xerces/java/src/org/apache/xml/serialize/Encodings.java
Index: Encodings.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Encodings.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- Encodings.java 29 Jan 2002 01:15:20 -0000 1.5
+++ Encodings.java 3 Jan 2003 16:13:27 -0000 1.6
@@ -63,6 +63,8 @@
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
+import java.util.Hashtable;
+import org.apache.xerces.util.EncodingMap;
/**
@@ -81,19 +83,76 @@
/**
* The last printable character for unknown encodings.
*/
- static final int DefaultLastPrintable = 0x7F;
+ static final int DEFAULT_LAST_PRINTABLE = 0x7F;
+
+ // last printable character for Unicode-compatible encodings
+ static final int LAST_PRINTABLE_UNICODE = 0xffff;
+ // unicode-compliant encodings; can express plane 0
+ static final String[] UNICODE_ENCODINGS = {
+ "Unicode", "UnicodeBig", "UnicodeLittle", "GB2312", "UTF8",
+ };
+ // default (Java) encoding if none supplied:
+ static final String DEFAULT_ENCODING = "UTF8";
+
+ // note that the size of this Hashtable
+ // is bounded by the number of encodings recognized by EncodingMap;
+ // therefore it poses no static mutability risk.
+ static Hashtable _encodings = new Hashtable();
/**
* @param encoding a MIME charset name, or null.
*/
- static EncodingInfo getEncodingInfo(String encoding) {
- if (encoding == null)
- return new EncodingInfo(null, DefaultLastPrintable);
- for (int i = 0; i < _encodings.length; i++) {
- if (_encodings[i].name.equalsIgnoreCase(encoding))
- return _encodings[i];
+ static EncodingInfo getEncodingInfo(String encoding, boolean allowJavaNames) throws UnsupportedEncodingException {
+ EncodingInfo eInfo = null;
+ if (encoding == null) {
+ if((eInfo = (EncodingInfo)_encodings.get(DEFAULT_ENCODING)) != null)
+ return eInfo;
+ eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(DEFAULT_ENCODING), DEFAULT_ENCODING, LAST_PRINTABLE_UNICODE);
+ _encodings.put(DEFAULT_ENCODING, eInfo);
+ return eInfo;
+ }
+ // need to convert it to upper case:
+ encoding = encoding.toUpperCase();
+ String jName = EncodingMap.getIANA2JavaMapping(encoding);
+ if(jName == null) {
+ // see if the encoding passed in is a Java encoding name.
+ if(allowJavaNames ) {
+ EncodingInfo.testJavaEncodingName(encoding);
+ if((eInfo = (EncodingInfo)_encodings.get(encoding)) != null)
+ return eInfo;
+ // is it known to be unicode-compliant?
+ int i=0;
+ for(; i<UNICODE_ENCODINGS.length; i++) {
+ if(UNICODE_ENCODINGS[i].equalsIgnoreCase(encoding)) {
+ eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(encoding), encoding, LAST_PRINTABLE_UNICODE);
+ break;
+ }
+ }
+ if(i == UNICODE_ENCODINGS.length) {
+ eInfo = new EncodingInfo(EncodingMap.getJava2IANAMapping(encoding), encoding, DEFAULT_LAST_PRINTABLE);
+ }
+ _encodings.put(encoding, eInfo);
+ return eInfo;
+ } else {
+ throw new UnsupportedEncodingException(encoding);
+ }
+ }
+ if ((eInfo = (EncodingInfo)_encodings.get(jName)) != null)
+ return eInfo;
+ // have to create one...
+ // is it known to be unicode-compliant?
+ int i=0;
+ for(; i<UNICODE_ENCODINGS.length; i++) {
+ if(UNICODE_ENCODINGS[i].equalsIgnoreCase(jName)) {
+ eInfo = new EncodingInfo(encoding, jName, LAST_PRINTABLE_UNICODE);
+ break;
+ }
}
- return new SieveEncodingInfo(encoding, DefaultLastPrintable);
+ if(i == UNICODE_ENCODINGS.length) {
+ eInfo = new EncodingInfo(encoding, jName, DEFAULT_LAST_PRINTABLE);
+ }
+ _encodings.put(jName, eInfo);
+ return eInfo;
}
static final String JIS_DANGER_CHARS
@@ -101,37 +160,4 @@
+"\u2014\u2015\u2016\u2026\u203e\u203e\u2225\u222f\u301c"
+"\uff3c\uff5e\uffe0\uffe1\uffe2\uffe3";
- /**
- * Constructs a list of all the supported encodings.
- */
- private static final EncodingInfo[] _encodings = new EncodingInfo[] {
- new EncodingInfo("ASCII", 0x7F),
- new EncodingInfo("US-ASCII", 0x7F),
- new EncodingInfo("ISO-8859-1", 0xFF),
- new EncodingInfo("ISO-8859-2", 0xFF),
- new EncodingInfo("ISO-8859-3", 0xFF),
- new EncodingInfo("ISO-8859-4", 0xFF),
- new EncodingInfo("ISO-8859-5", 0xFF),
- new EncodingInfo("ISO-8859-6", 0xFF),
- new EncodingInfo("ISO-8859-7", 0xFF),
- new EncodingInfo("ISO-8859-8", 0xFF),
- new EncodingInfo("ISO-8859-9", 0xFF),
- /**
- * Does JDK's converter supprt surrogates?
- * A Java encoding name "UTF-8" is suppoted by JDK 1.2 or later.
- */
- new EncodingInfo("UTF-8", "UTF8", 0x10FFFF),
- /**
- * JDK 1.1 supports "Shift_JIS" as an alias of "SJIS".
- * But JDK 1.2 treats "Shift_JIS" as an alias of "MS932".
- * The JDK 1.2's behavior is invalid against IANA registrations.
- */
- new SieveEncodingInfo("Shift_JIS", "SJIS", 0x7F, JIS_DANGER_CHARS),
- /**
- * "MS932" is supported by JDK 1.2 or later.
- */
- new SieveEncodingInfo("Windows-31J", "MS932", 0x7F, JIS_DANGER_CHARS),
- new SieveEncodingInfo("EUC-JP", null, 0x7F, JIS_DANGER_CHARS),
- new SieveEncodingInfo("ISO-2022-JP", null, 0x7F, JIS_DANGER_CHARS),
- };
}
1.3 +62 -20 xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java
Index: EncodingInfo.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- EncodingInfo.java 29 Jan 2002 01:15:20 -0000 1.2
+++ EncodingInfo.java 3 Jan 2003 16:13:27 -0000 1.3
@@ -62,6 +62,8 @@
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
+import sun.io.CharToByteConverter;
+import org.apache.xerces.util.EncodingMap;
/**
* This class represents an encoding.
@@ -70,31 +72,33 @@
*/
public class EncodingInfo {
- String name;
+ // name of encoding as registered with IANA;
+ // preferably a MIME name, but aliases are fine too.
+ String ianaName;
String javaName;
int lastPrintable;
+ // the charToByteConverter we test unusual characters
+ // with
+ CharToByteConverter fCToB = null;
+ // is the converter null because it can't be instantiated
+ // for some reason (perhaps we're running with insufficient authority as
+ // an applet?
+ boolean fHaveTriedCToB = false;
/**
* Creates new <code>EncodingInfo</code> instance.
*/
- public EncodingInfo(String mimeName, String javaName, int lastPrintable) {
- this.name = mimeName;
- this.javaName = javaName == null ? mimeName : javaName;
+ public EncodingInfo(String ianaName, String javaName, int lastPrintable) {
+ this.ianaName = ianaName;
+ this.javaName = EncodingMap.getIANA2JavaMapping(ianaName);
this.lastPrintable = lastPrintable;
}
/**
- * Creates new <code>EncodingInfo</code> instance.
- */
- public EncodingInfo(String mimeName, int lastPrintable) {
- this(mimeName, mimeName, lastPrintable);
- }
-
- /**
* Returns a MIME charset name of this encoding.
*/
- public String getName() {
- return this.name;
+ public String getIANAName() {
+ return this.ianaName;
}
/**
@@ -107,16 +111,54 @@
*/
public Writer getWriter(OutputStream output)
throws UnsupportedEncodingException {
- if (this.javaName == null)
- return new OutputStreamWriter(output);
- return new OutputStreamWriter(output, this.javaName);
+ // this should always be true!
+ if (javaName != null)
+ return new OutputStreamWriter(output, javaName);
+ javaName = EncodingMap.getIANA2JavaMapping(ianaName);
+ if(javaName == null)
+ // use UTF-8 as preferred encoding
+ return new OutputStreamWriter(output, "UTF8");
+ return new OutputStreamWriter(output, javaName);
}
/**
- * Checks whether the specified character is printable or not.
+ * Checks whether the specified character is printable or not
+ * in this encoding.
*
* @param ch a code point (0-0x10ffff)
*/
- public boolean isPrintable(int ch) {
- return ch <= this.lastPrintable;
+ public boolean isPrintable(char ch) {
+ if(ch <= this.lastPrintable)
+ return true;
+
+ if(fCToB == null) {
+ if(fHaveTriedCToB) {
+ // forget it; nothing we can do...
+ return false;
+ }
+ // try and create it:
+ try {
+ fCToB = CharToByteConverter.getConverter(javaName);
+ } catch(Exception e) {
+ // don't try it again...
+ fHaveTriedCToB = true;
+ return false;
+ }
+ }
+ try {
+ return fCToB.canConvert(ch);
+ } catch (Exception e) {
+ // obviously can't use this converter; probably some kind of
+ // security restriction
+ fCToB = null;
+ fHaveTriedCToB = false;
+ return false;
+ }
+ }
+
+ // is this an encoding name recognized by this JDK?
+ // if not, will throw UnsupportedEncodingException
+ public static void testJavaEncodingName(String name) throws UnsupportedEncodingException {
+ final byte [] bTest = {'v', 'a', 'l', 'i', 'd'};
+ String s = new String(bTest, name);
}
}
1.18 +22 -4 xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java
Index: OutputFormat.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -r1.17 -r1.18
--- OutputFormat.java 29 Jan 2002 14:39:39 -0000 1.17
+++ OutputFormat.java 3 Jan 2003 16:13:27 -0000 1.18
@@ -68,6 +68,7 @@
import java.util.Hashtable;
+import java.io.UnsupportedEncodingException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
@@ -189,6 +190,9 @@
*/
private EncodingInfo _encodingInfo = null;
+ // whether java names for encodings are permitted
+ private boolean _allowJavaNames = false;
+
/**
* The specified media type or null.
*/
@@ -491,7 +495,7 @@
* instance.
*/
public void setEncoding(EncodingInfo encInfo) {
- _encoding = encInfo.getName();
+ _encoding = encInfo.getIANAName();
_encodingInfo = encInfo;
}
@@ -500,10 +504,24 @@
*
* @see #setEncoding
*/
- public EncodingInfo getEncodingInfo() {
+ public EncodingInfo getEncodingInfo() throws UnsupportedEncodingException {
if (_encodingInfo == null)
- _encodingInfo = Encodings.getEncodingInfo(_encoding);
+ _encodingInfo = Encodings.getEncodingInfo(_encoding, _allowJavaNames);
return _encodingInfo;
+ }
+
+ /**
+ * Sets whether java encoding names are permitted
+ */
+ public void setAllowJavaNames (boolean allow) {
+ _allowJavaNames = allow;
+ }
+
+ /**
+ * Returns whether java encoding names are permitted
+ */
+ public boolean setAllowJavaNames () {
+ return _allowJavaNames;
}
/**
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]