dlr 2005/05/16 14:23:21
Modified: src/java/org/apache/xmlrpc XmlRpc.java XmlWriter.java src/test/org/apache/xmlrpc XmlWriterTest.java Log: Significant changes to handling of character encodings, both character set and XML related, to support the widest range of interoperability while still functioning correctly. * src/java/org/apache/xmlrpc/XmlWriter.java Updated header doc with a link to Tim Bray's annotated XML spec (recommended by John Wilson). (PROLOG_START, PROLOG_END): Removed encoding portion. XML parsers should assume a Unicode encoding (which will work even if we wrote ASCII, since it is a subset). (ISO8859_1, UTF8): Reduced visibility from protected to package-private. Corrected typo in JavaDoc. (UTF16): New constant for the string "UTF-16". (encodings): Added URL to JavaDoc. (hasWrittenProlog): Added flag indicating whether or not the XML prolog has been written by this Writer instance. (XmlWriter): No longer writes XML prolog on instantiation; that is now delayed until write() is called explicitly. As such, removed throws decl for IOException (which will be backwards compatible, since UnsupportedEncodingException sub-classes IOException). Output encoding is now forced to UTF-8, if specified encoding is non-Unicode. Since XML parsers are required to support UTF-8 and UTF-16, this should be seamless from a caller's perspective (especially since ASCII is a subset of UTF-8). (write): New overload which writes the XML prolog lazily. (writeCharacterReference): New helper method used to write XML character references for single characters (e.g. '\r' as "&#13;").
(chardata): Removed unused local variables enc and isUnicode. Write carriage returns as XML character references (as recommended by John Wilson). Write characters not valid in XML using character references. (isValidXMLChar): Helper function capturing the set of characters known to be valid XML. * src/test/org/apache/xmlrpc/XmlWriterTest.java (buffer, writer): Instance fields used by all tests. (setUp): Initialize buffer to an empty ByteArrayOutputStream. (testForceAlternateEncoding): New test which assures that non-Unicode output encodings are forced to UTF-8 by XmlWriter. (testBasicResults): Renamed from testWriter to provide a somewhat less generic name. Added description messages for assertion failures. Added tests for Boolean. (testWriteCharacterReference): New test for writing characters as XML character references. * src/java/org/apache/xmlrpc/XmlRpc.java (encoding): Changed default output encoding from ISO-8859-1 to UTF-8. Target release: 2.0 Revision Changes Path 1.42 +4 -3 ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java Index: XmlRpc.java =================================================================== RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java,v retrieving revision 1.41 retrieving revision 1.42 diff -u -u -r1.41 -r1.42 --- XmlRpc.java 28 Apr 2005 21:26:38 -0000 1.41 +++ XmlRpc.java 16 May 2005 21:23:21 -0000 1.42 @@ -147,9 +147,10 @@ /** * Java's name for the encoding we're using. Defaults to - * <code>ISO8859_1</code>. + * <code>UTF8</code> (of which <code>ISO8859_1</code> is a + * subset). */ - static String encoding = XmlWriter.ISO8859_1; + static String encoding = XmlWriter.UTF8; /** * Java's name for the input encoding we're using. 
Defaults to 1.14 +117 -50 ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java Index: XmlWriter.java =================================================================== RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java,v retrieving revision 1.13 retrieving revision 1.14 diff -u -u -r1.13 -r1.14 --- XmlWriter.java 2 May 2005 04:22:21 -0000 1.13 +++ XmlWriter.java 16 May 2005 21:23:21 -0000 1.14 @@ -32,7 +32,7 @@ import org.apache.commons.codec.EncoderException; /** - * A quick and dirty XML writer. If you feed it a + * A XML writer intended for single-thread usage. If you feed it a * <code>ByteArrayInputStream</code>, it may be necessary to call * <code>writer.flush()</code> before calling * <code>buffer.toByteArray()</code> to get the data written to @@ -40,12 +40,13 @@ * * @author <a href="mailto:[EMAIL PROTECTED]">Hannes Wallnoefer</a> * @author Daniel L. Rall + * @see <a href="http://www.xml.com/axml/testaxml.htm">Tim Bray's + * Annotated XML Spec</a> */ class XmlWriter extends OutputStreamWriter { // Various XML pieces. - protected static final String PROLOG_START = - "<?xml version=\"1.0\" encoding=\""; + protected static final String PROLOG_START = "<?xml version=\"1.0"; protected static final String PROLOG_END = "\"?>"; protected static final String CLOSING_TAG_START = "</"; protected static final String SINGLE_TAG_END = "/>"; @@ -54,14 +55,19 @@ protected static final String AMPERSAND_ENTITY = "&amp;"; /** - * Java's name for the the ISO8859_1 encoding. + * Java's name for the ISO-8859-1 encoding. */ - protected static final String ISO8859_1 = "ISO8859_1"; + static final String ISO8859_1 = "ISO8859_1"; /** - * Java's name for the the UTF8 encoding. + * Java's name for the UTF-8 encoding. */ - protected static final String UTF8 = "UTF8"; + static final String UTF8 = "UTF8"; + + /** + * Java's name for the UTF-16 encoding. 
+ */ + static final String UTF16 = "UTF-16"; protected static final Base64 base64Codec = new Base64(); @@ -73,6 +79,8 @@ /** * Mapping between Java encoding names and "real" names used in * XML prolog. + * + * @see <a href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Java character set names</a> */ private static Properties encodings = new Properties(); @@ -90,23 +98,40 @@ private static DateTool dateTool = new DateTool(); /** + * Whether the XML prolog has been written. + */ + boolean hasWrittenProlog = false; + + /** * Creates a new instance. * * @param out The stream to write output to. - * @param enc The encoding to using for outputing XML. - * @throws UnsupportedEncodingException Encoding unrecognized. - * @throws IOException Problem writing. + * @param enc The encoding to using for outputing XML. Only UTF-8 + * and UTF-16 are supported. If another encoding is specified, + * UTF-8 will be used instead for widest XML parser + * interoperability. + * @exception UnsupportedEncodingException Since unsupported + * encodings are internally converted to UTF-8, this should only + * be seen as the result of an internal error. */ public XmlWriter(OutputStream out, String enc) - throws UnsupportedEncodingException, IOException + throws UnsupportedEncodingException { // Super-class wants the Java form of the encoding. - super(out, enc); + super(out, forceUnicode(enc)); + } - // Add the XML prolog (including the encoding in XML form). - write(PROLOG_START); - write(canonicalizeEncoding(enc)); - write(PROLOG_END); + /** + * @param encoding A caller-specified encoding. + * @return An Unicode encoding. + */ + private static String forceUnicode(String encoding) + { + if (encoding == null || !encoding.toUpperCase().startsWith("UTF")) + { + encoding = UTF8; + } + return encoding; } /** @@ -116,6 +141,8 @@ * @param javaEncoding The name of the encoding as known by Java. 
* @return The XML encoding (if a mapping is available); * otherwise, the encoding as provided. + * + * @deprecated This method will not be visible in 2.0. */ protected static String canonicalizeEncoding(String javaEncoding) { @@ -123,6 +150,25 @@ } /** + * A mostly pass-through implementation wrapping + * <code>OutputStreamWriter.write()</code> which assures that the + * XML prolog is written before any other data. + * + * @see java.io.OutputStreamWriter.write(char[], int, int) + */ + public void write(char[] cbuf, int off, int len) + throws IOException + { + if (!hasWrittenProlog) + { + super.write(PROLOG_START, 0, PROLOG_START.length()); + super.write(PROLOG_END, 0, PROLOG_END.length()); + hasWrittenProlog = true; + } + super.write(cbuf, off, len); + } + + /** * Writes the XML representation of a supported Java object type. * * @param obj The <code>Object</code> to write. @@ -246,6 +292,17 @@ } /** + * Writes characters like '\r' (0xd) as "&#13;". + */ + private void writeCharacterReference(char c) + throws IOException + { + write("&#"); + write(String.valueOf((int) c)); + write(';'); + } + + /** * * @param elem * @throws IOException @@ -292,8 +349,6 @@ throws XmlRpcException, IOException { int l = text.length (); - String enc = super.getEncoding(); - boolean isUnicode = UTF8.equals(enc) || "UTF-16".equals(enc); // ### TODO: Use a buffer rather than going character by // ### character to scale better for large text sizes. //char[] buf = new char[32]; @@ -303,10 +358,13 @@ switch (c) { case '\t': - case '\r': case '\n': write(c); break; + case '\r': + // Avoid normalization of CR to LF. + writeCharacterReference(c); + break; case '<': write(LESS_THAN_ENTITY); break; @@ -317,38 +375,18 @@ write(AMPERSAND_ENTITY); break; default: - if (c < 0x20 || c > 0x7f) + // Though the XML spec requires XML parsers to support + // Unicode, not all such code points are valid in XML + // documents. 
Additionally, previous to 2003-06-30 + // the XML-RPC spec only allowed ASCII data (in + // <string> elements). For interoperability with + // clients rigidly conforming to the pre-2003 version + // of the XML-RPC spec, we entity encode characters + // outside of the valid range for ASCII, too. + if (c > 0x7f || !isValidXMLChar(c)) { - // Though the XML-RPC spec allows any ASCII - // characters except '<' and '&', the XML spec - // does not allow this range of characters, - // resulting in a parse error from most XML - // parsers. However, the XML spec does require - // XML parsers to support UTF-8 and UTF-16. - if (isUnicode) - { - if (c < 0x20) - { - // Entity escape the character. - write("&#"); - // ### Do we really need the String conversion? - write(String.valueOf((int) c)); - write(';'); - } - else // c > 0x7f - { - // Write the character in our encoding. - write(new String(String.valueOf(c).getBytes(enc))); - } - } - else - { - throw new XmlRpcException(0, "Invalid character data " - + "corresponding to XML " - + "entity &#" - + String.valueOf((int) c) - + ';'); - } + // Replace the code point with a character reference. + writeCharacterReference(c); } else { @@ -358,6 +396,35 @@ } } + /** + * Section 2.2 of the XML spec describes which Unicode code points + * are valid in XML: + * + * <blockquote><code>#x9 | #xA | #xD | [#x20-#xD7FF] | + * [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></blockquote> + * + * Code points outside this set must be entity encoded to be + * represented in XML. + * + * @param c The character to inspect. + * @return Whether the specified character is valid in XML. 
+ */ + private static final boolean isValidXMLChar(char c) + { + switch (c) + { + case 0x9: + case 0xa: // line feed, '\n' + case 0xd: // carriage return, '\r' + return true; + + default: + return ( (0x20 < c && c <= 0xd7ff) || + (0xe000 < c && c <= 0xfffd) || + (0x10000 < c && c <= 0x10ffff) ); + } + } + protected static void setTypeDecoder(TypeDecoder newTypeDecoder) { typeDecoder = newTypeDecoder; 1.11 +64 -10 ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java Index: XmlWriterTest.java =================================================================== RCS file: /home/cvs/ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java,v retrieving revision 1.10 retrieving revision 1.11 diff -u -u -r1.10 -r1.11 --- XmlWriterTest.java 10 May 2005 18:58:45 -0000 1.10 +++ XmlWriterTest.java 16 May 2005 21:23:21 -0000 1.11 @@ -33,6 +33,9 @@ public class XmlWriterTest extends TestCase { + private ByteArrayOutputStream buffer; + private XmlWriter writer; + /** * Constructor */ @@ -55,6 +58,7 @@ public void setUp() { XmlRpc.setDebug(true); + buffer = new ByteArrayOutputStream(); } /** @@ -65,27 +69,63 @@ XmlRpc.setDebug(false); } - public void testWriter() + public void testForceAlternateEncoding() + throws Exception + { + writer = new XmlWriter(buffer, null); + assertEquals("null should be forced to UTF-8", + XmlWriter.UTF8, writer.getEncoding()); + + writer = new XmlWriter(buffer, XmlWriter.ISO8859_1); + assertEquals(XmlWriter.ISO8859_1 + " should be forced to " + + XmlWriter.UTF8, XmlWriter.UTF8, writer.getEncoding()); + + writer = new XmlWriter(buffer, "ISO8859_15"); + assertEquals("ISO8859_15 should be forced to " + XmlWriter.UTF8, + XmlWriter.UTF8, writer.getEncoding()); + + writer = new XmlWriter(buffer, "EUC_JP"); + assertEquals("EUC_JP should be forced to " + XmlWriter.UTF8, + XmlWriter.UTF8, writer.getEncoding()); + + writer = new XmlWriter(buffer, XmlWriter.UTF16); + assertEquals(XmlWriter.UTF16 + " should remain " + XmlWriter.UTF16, + XmlWriter.UTF16, 
writer.getEncoding()); + } + + public void testBasicResults() throws Exception { try { - ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XmlWriter writer = new XmlWriter(buffer, XmlWriter.ISO8859_1); - assertTrue(writer.getEncoding().equals(XmlRpc.encoding)); + writer = new XmlWriter(buffer, XmlWriter.UTF8); + + writer.write(new char[0], 0, 0); + writer.flush(); + assertEquals("Unexpected or missing XML prolog", + XmlWriter.PROLOG_START + XmlWriter.PROLOG_END, + buffer.toString()); String foobar = "foobar"; writer.writeObject(foobar); writer.flush(); - //System.err.println("buffer=" + new String(buffer.toByteArray())); String postProlog = "<value>" + foobar + "</value>"; - assertTrue(buffer.toString().endsWith(postProlog)); + assertTrue("Unexpected results from writing of String", + buffer.toString().endsWith(postProlog)); Integer thirtySeven = new Integer(37); writer.writeObject(thirtySeven); writer.flush(); postProlog += "<value><int>" + thirtySeven + "</int></value>"; - assertTrue(buffer.toString().endsWith(postProlog)); + assertTrue("Unexpected results from writing of Integer", + buffer.toString().endsWith(postProlog)); + + Boolean flag = Boolean.TRUE; + writer.writeObject(flag); + writer.flush(); + postProlog += "<value><boolean>1</boolean></value>"; + assertTrue("Unexpected results from writing of Boolean", + buffer.toString().endsWith(postProlog)); Object[] array = { foobar, thirtySeven }; writer.writeObject(array); @@ -94,7 +134,8 @@ postProlog += "<value>" + foobar + "</value>"; postProlog += "<value><int>" + thirtySeven + "</int></value>"; postProlog += "</data></array></value>"; - assertTrue(buffer.toString().endsWith(postProlog)); + assertTrue("Unexpected results from writing of Object[]", + buffer.toString().endsWith(postProlog)); Hashtable map = new Hashtable(); map.put(foobar, thirtySeven); @@ -104,7 +145,8 @@ postProlog += "<name>" + foobar + "</name>"; postProlog += "<value><int>" + thirtySeven + "</int></value>"; postProlog += 
"</member></struct></value>"; - assertTrue(buffer.toString().endsWith(postProlog)); + assertTrue("Unexpected results from writing of Hashtable", + buffer.toString().endsWith(postProlog)); } catch (Exception e) { @@ -112,4 +154,16 @@ fail(e.getMessage()); } } + + public void testWriteCharacterReference() + throws Exception + { + writer = new XmlWriter(buffer, null); + writer.hasWrittenProlog = true; + writer.writeObject(String.valueOf((char) 0x80)); + writer.flush(); + String postProlog = "<value>&#128;</value>"; + assertTrue("Character reference not created as expected", + buffer.toString().endsWith(postProlog)); + } }