dlr         2005/05/16 14:23:21

  Modified:    src/java/org/apache/xmlrpc XmlRpc.java XmlWriter.java
               src/test/org/apache/xmlrpc XmlWriterTest.java
  Log:
  Significant changes to handling of character encodings, both character
  set and XML related, to support the widest range of interoperability
  while still functioning correctly.
  
  * src/java/org/apache/xmlrpc/XmlWriter.java
    Updated header doc with a link to Tim Bray's annotated XML spec
    (recommended by John Wilson).
  
    (PROLOG_START, PROLOG_END): Removed encoding portion.  XML parsers
     should assume a Unicode encoding (which will work even if we wrote
     ASCII, since it is a subset).
  
    (ISO8859_1, UTF8): Reduced visibility from protected to
     package-private. Corrected typo in JavaDoc.
  
    (UTF16): New constant for the string "UTF-16".
  
    (encodings): Added URL to JavaDoc.
  
    (hasWrittenProlog): Added flag indicating whether or not the XML
     prolog has been written by this Writer instance.
  
    (XmlWriter): No longer writes XML prolog on instantiation; that is
     now delayed until write() is called explicitly.  As such, removed
     throws decl for IOException (which will be backwards compatible,
     since UnsupportedEncodingException sub-classes IOException).
     Output encoding is now forced to UTF-8, if specified encoding is
     non-Unicode.  Since XML parsers are required to support UTF-8 and
     UTF-16, this should be seamless from a caller's perspective
     (especially since ASCII is a subset of UTF-8).
  
    (write): New overload which writes the XML prolog lazily.
  
    (writeCharacterReference): New helper method used to write XML
     character references for single characters (e.g. '\r' as "&#13;").
  
    (chardata): Removed unused local variables enc and isUnicode.  Write
     carriage returns as XML character references (as recommended by
     John Wilson).  Write characters not valid in XML using character
     references.
  
    (isValidXMLChar): Helper function capturing the set of characters
     known to be valid XML.
  
  * src/test/org/apache/xmlrpc/XmlWriterTest.java
    (buffer, writer): Instance fields used by all tests.
  
    (setUp): Initialize buffer to an empty ByteArrayOutputStream.
  
    (testForceAlternateEncoding): New test which assures that
     non-Unicode output encodings are forced to UTF-8 by XmlWriter.
  
    (testBasicResults): Renamed from testWriter to provide a somewhat
     less generic name.  Added description messages for assertion
     failures.  Added tests for Boolean.
  
    (testWriteCharacterReference): New test for writing characters as
     XML character references.
  
  * src/java/org/apache/xmlrpc/XmlRpc.java
    (encoding): Changed default output encoding from ISO-8859-1 to
     UTF-8.
  
  Target release: 2.0
  
  Revision  Changes    Path
  1.42      +4 -3      ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java
  
  Index: XmlRpc.java
  ===================================================================
  RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java,v
  retrieving revision 1.41
  retrieving revision 1.42
  diff -u -u -r1.41 -r1.42
  --- XmlRpc.java       28 Apr 2005 21:26:38 -0000      1.41
  +++ XmlRpc.java       16 May 2005 21:23:21 -0000      1.42
  @@ -147,9 +147,10 @@
   
       /**
        * Java's name for the encoding we're using.  Defaults to
  -     * <code>ISO8859_1</code>.
  +     * <code>UTF8</code> (of which <code>ISO8859_1</code> is a
  +     * subset).
        */
  -    static String encoding = XmlWriter.ISO8859_1;
  +    static String encoding = XmlWriter.UTF8;
   
       /**
        * Java's name for the input encoding we're using.  Defaults to
  
  
  
  1.14      +117 -50   ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java
  
  Index: XmlWriter.java
  ===================================================================
  RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -u -r1.13 -r1.14
  --- XmlWriter.java    2 May 2005 04:22:21 -0000       1.13
  +++ XmlWriter.java    16 May 2005 21:23:21 -0000      1.14
  @@ -32,7 +32,7 @@
   import org.apache.commons.codec.EncoderException;
   
   /**
  - * A quick and dirty XML writer.  If you feed it a
  + * A XML writer intended for single-thread usage.  If you feed it a
    * <code>ByteArrayInputStream</code>, it may be necessary to call
    * <code>writer.flush()</code> before calling
    * <code>buffer.toByteArray()</code> to get the data written to
  @@ -40,12 +40,13 @@
    *
    * @author <a href="mailto:[EMAIL PROTECTED]">Hannes Wallnoefer</a>
    * @author Daniel L. Rall
  + * @see <a href="http://www.xml.com/axml/testaxml.htm">Tim Bray's
  + * Annotated XML Spec</a>
    */
   class XmlWriter extends OutputStreamWriter
   {
       // Various XML pieces.
  -    protected static final String PROLOG_START =
  -        "<?xml version=\"1.0\" encoding=\"";
  +    protected static final String PROLOG_START = "<?xml version=\"1.0";
       protected static final String PROLOG_END = "\"?>";
       protected static final String CLOSING_TAG_START = "</";
       protected static final String SINGLE_TAG_END = "/>";
  @@ -54,14 +55,19 @@
       protected static final String AMPERSAND_ENTITY = "&amp;";
   
       /**
  -     * Java's name for the the ISO8859_1 encoding.
  +     * Java's name for the ISO-8859-1 encoding.
        */
  -    protected static final String ISO8859_1 = "ISO8859_1";
  +    static final String ISO8859_1 = "ISO8859_1";
   
       /**
  -     * Java's name for the the UTF8 encoding.
  +     * Java's name for the UTF-8 encoding.
        */
  -    protected static final String UTF8 = "UTF8";
  +    static final String UTF8 = "UTF8";
  +
  +    /**
  +     * Java's name for the UTF-16 encoding.
  +     */
  +    static final String UTF16 = "UTF-16";
       
       protected static final Base64 base64Codec = new Base64();
   
  @@ -73,6 +79,8 @@
       /**
        * Mapping between Java encoding names and "real" names used in
        * XML prolog.
  +     *
  +     * @see <a href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Java character set names</a>
        */
       private static Properties encodings = new Properties();
   
  @@ -90,23 +98,40 @@
       private static DateTool dateTool = new DateTool();
   
       /**
  +     * Whether the XML prolog has been written.
  +     */
  +    boolean hasWrittenProlog = false;
  +
  +    /**
        * Creates a new instance.
        *
        * @param out The stream to write output to.
  -     * @param enc The encoding to using for outputing XML.
  -     * @throws UnsupportedEncodingException Encoding unrecognized.
  -     * @throws IOException Problem writing.
  +     * @param enc The encoding to using for outputing XML.  Only UTF-8
  +     * and UTF-16 are supported.  If another encoding is specified,
  +     * UTF-8 will be used instead for widest XML parser
  +     * interoperability.
  +     * @exception UnsupportedEncodingException Since unsupported
  +     * encodings are internally converted to UTF-8, this should only
  +     * be seen as the result of an internal error.
        */
       public XmlWriter(OutputStream out, String enc)
  -        throws UnsupportedEncodingException, IOException
  +        throws UnsupportedEncodingException
       {
           // Super-class wants the Java form of the encoding.
  -        super(out, enc);
  +        super(out, forceUnicode(enc));
  +    }
   
  -        // Add the XML prolog (including the encoding in XML form).
  -        write(PROLOG_START);
  -        write(canonicalizeEncoding(enc));
  -        write(PROLOG_END);
  +    /**
  +     * @param encoding A caller-specified encoding.
  +     * @return An Unicode encoding.
  +     */
  +    private static String forceUnicode(String encoding)
  +    {
  +        if (encoding == null || !encoding.toUpperCase().startsWith("UTF"))
  +        {
  +            encoding = UTF8;
  +        }
  +        return encoding;
       }
   
       /**
  @@ -116,6 +141,8 @@
        * @param javaEncoding The name of the encoding as known by Java.
        * @return The XML encoding (if a mapping is available);
        * otherwise, the encoding as provided.
  +     *
  +     * @deprecated This method will not be visible in 2.0.
        */
       protected static String canonicalizeEncoding(String javaEncoding)
       {
  @@ -123,6 +150,25 @@
       }
   
       /**
  +     * A mostly pass-through implementation wrapping
  +     * <code>OutputStreamWriter.write()</code> which assures that the
  +     * XML prolog is written before any other data.
  +     *
  +     * @see java.io.OutputStreamWriter.write(char[], int, int)
  +     */
  +    public void write(char[] cbuf, int off, int len)
  +        throws IOException
  +    {
  +        if (!hasWrittenProlog)
  +        {
  +            super.write(PROLOG_START, 0, PROLOG_START.length());
  +            super.write(PROLOG_END, 0, PROLOG_END.length());
  +            hasWrittenProlog = true;
  +        }
  +        super.write(cbuf, off, len);
  +    }
  +
  +    /**
        * Writes the XML representation of a supported Java object type.
        *
        * @param obj The <code>Object</code> to write.
  @@ -246,6 +292,17 @@
       }
   
       /**
  +     * Writes characters like '\r' (0xd) as "&amp;#13;".
  +     */
  +    private void writeCharacterReference(char c)
  +        throws IOException
  +    {
  +        write("&#");
  +        write(String.valueOf((int) c));
  +        write(';');
  +    }
  +
  +    /**
        *
        * @param elem
        * @throws IOException
  @@ -292,8 +349,6 @@
           throws XmlRpcException, IOException
       {
           int l = text.length ();
  -        String enc = super.getEncoding();
  -        boolean isUnicode = UTF8.equals(enc) || "UTF-16".equals(enc);
           // ### TODO: Use a buffer rather than going character by
           // ### character to scale better for large text sizes.
           //char[] buf = new char[32];
  @@ -303,10 +358,13 @@
               switch (c)
               {
               case '\t':
  -            case '\r':
               case '\n':
                   write(c);
                   break;
  +            case '\r':
  +                // Avoid normalization of CR to LF.
  +                writeCharacterReference(c);
  +                break;
               case '<':
                   write(LESS_THAN_ENTITY);
                   break;
  @@ -317,38 +375,18 @@
                   write(AMPERSAND_ENTITY);
                   break;
               default:
  -                if (c < 0x20 || c > 0x7f)
  +                // Though the XML spec requires XML parsers to support
  +                // Unicode, not all such code points are valid in XML
  +                // documents.  Additionally, previous to 2003-06-30
  +                // the XML-RPC spec only allowed ASCII data (in
  +                // <string> elements).  For interoperability with
  +                // clients rigidly conforming to the pre-2003 version
  +                // of the XML-RPC spec, we entity encode characters
  +                // outside of the valid range for ASCII, too.
  +                if (c > 0x7f || !isValidXMLChar(c))
                   {
  -                    // Though the XML-RPC spec allows any ASCII
  -                    // characters except '<' and '&', the XML spec
  -                    // does not allow this range of characters,
  -                    // resulting in a parse error from most XML
  -                    // parsers.  However, the XML spec does require
  -                    // XML parsers to support UTF-8 and UTF-16.
  -                    if (isUnicode)
  -                    {
  -                        if (c < 0x20)
  -                        {
  -                            // Entity escape the character.
  -                            write("&#");
  -                            // ### Do we really need the String conversion?
  -                            write(String.valueOf((int) c));
  -                            write(';');
  -                        }
  -                        else // c > 0x7f
  -                        {
  -                            // Write the character in our encoding.
  -                            write(new String(String.valueOf(c).getBytes(enc)));
  -                        }
  -                    }
  -                    else
  -                    {
  -                        throw new XmlRpcException(0, "Invalid character data "
  -                                                  + "corresponding to XML "
  -                                                  + "entity &#"
  -                                                  + String.valueOf((int) c)
  -                                                  + ';');
  -                    }
  +                    // Replace the code point with a character reference.
  +                    writeCharacterReference(c);
                   }
                   else
                   {
  @@ -358,6 +396,35 @@
           }
       }
   
  +    /**
  +     * Section 2.2 of the XML spec describes which Unicode code points
  +     * are valid in XML:
  +     *
  +     * <blockquote><code>#x9 | #xA | #xD | [#x20-#xD7FF] |
  +     * [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></blockquote>
  +     *
  +     * Code points outside this set must be entity encoded to be
  +     * represented in XML.
  +     *
  +     * @param c The character to inspect.
  +     * @return Whether the specified character is valid in XML.
  +     */
  +    private static final boolean isValidXMLChar(char c)
  +    {
  +        switch (c)
  +        {
  +        case 0x9:
  +        case 0xa:  // line feed, '\n'
  +        case 0xd:  // carriage return, '\r'
  +            return true;
  +
  +        default:
  +            return ( (0x20 < c && c <= 0xd7ff) ||
  +                     (0xe000 < c && c <= 0xfffd) ||
  +                     (0x10000 < c && c <= 0x10ffff) );
  +        }
  +    }
  +
       protected static void setTypeDecoder(TypeDecoder newTypeDecoder)
       {
           typeDecoder = newTypeDecoder;
  
  
  
  1.11      +64 -10    ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java
  
  Index: XmlWriterTest.java
  ===================================================================
  RCS file: /home/cvs/ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -u -r1.10 -r1.11
  --- XmlWriterTest.java        10 May 2005 18:58:45 -0000      1.10
  +++ XmlWriterTest.java        16 May 2005 21:23:21 -0000      1.11
  @@ -33,6 +33,9 @@
   public class XmlWriterTest
       extends TestCase 
   {
  +    private ByteArrayOutputStream buffer;
  +    private XmlWriter writer;
  +
       /**
        * Constructor
        */
  @@ -55,6 +58,7 @@
       public void setUp() 
       {
           XmlRpc.setDebug(true);
  +        buffer = new ByteArrayOutputStream();
       }
      
       /**
  @@ -65,27 +69,63 @@
           XmlRpc.setDebug(false);
       }
   
  -    public void testWriter()
  +    public void testForceAlternateEncoding()
  +        throws Exception
  +    {
  +        writer = new XmlWriter(buffer, null);
  +        assertEquals("null should be forced to UTF-8",
  +                     XmlWriter.UTF8, writer.getEncoding());
  +
  +        writer = new XmlWriter(buffer, XmlWriter.ISO8859_1);
  +        assertEquals(XmlWriter.ISO8859_1 + " should be forced to " +
  +                     XmlWriter.UTF8, XmlWriter.UTF8, writer.getEncoding());
  +
  +        writer = new XmlWriter(buffer, "ISO8859_15");
  +        assertEquals("ISO8859_15 should be forced to " + XmlWriter.UTF8,
  +                     XmlWriter.UTF8, writer.getEncoding());
  +
  +        writer = new XmlWriter(buffer, "EUC_JP");
  +        assertEquals("EUC_JP should be forced to " + XmlWriter.UTF8,
  +                     XmlWriter.UTF8, writer.getEncoding());
  +
  +        writer = new XmlWriter(buffer, XmlWriter.UTF16);
  +        assertEquals(XmlWriter.UTF16 + " should remain " + XmlWriter.UTF16,
  +                     XmlWriter.UTF16, writer.getEncoding());
  +    }
  +
  +    public void testBasicResults()
           throws Exception
       {
           try
           {
  -            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  -            XmlWriter writer = new XmlWriter(buffer, XmlWriter.ISO8859_1);
  -            assertTrue(writer.getEncoding().equals(XmlRpc.encoding));
  +            writer = new XmlWriter(buffer, XmlWriter.UTF8);
  +
  +            writer.write(new char[0], 0, 0);
  +            writer.flush();
  +            assertEquals("Unexpected or missing XML prolog",
  +                         XmlWriter.PROLOG_START + XmlWriter.PROLOG_END,
  +                         buffer.toString());
   
               String foobar = "foobar";
               writer.writeObject(foobar);
               writer.flush();
  -            //System.err.println("buffer=" + new String(buffer.toByteArray()));
               String postProlog = "<value>" + foobar + "</value>";
  -            assertTrue(buffer.toString().endsWith(postProlog));
  +            assertTrue("Unexpected results from writing of String",
  +                       buffer.toString().endsWith(postProlog));
   
               Integer thirtySeven = new Integer(37);
               writer.writeObject(thirtySeven);
               writer.flush();
               postProlog += "<value><int>" + thirtySeven + "</int></value>";
  -            assertTrue(buffer.toString().endsWith(postProlog));
  +            assertTrue("Unexpected results from writing of Integer",
  +                       buffer.toString().endsWith(postProlog));
  +
  +            Boolean flag = Boolean.TRUE;
  +            writer.writeObject(flag);
  +            writer.flush();
  +            postProlog += "<value><boolean>1</boolean></value>";
  +            assertTrue("Unexpected results from writing of Boolean",
  +                       buffer.toString().endsWith(postProlog));
   
               Object[] array = { foobar, thirtySeven };
               writer.writeObject(array);
  @@ -94,7 +134,8 @@
               postProlog += "<value>" + foobar + "</value>";
               postProlog += "<value><int>" + thirtySeven + "</int></value>";
               postProlog += "</data></array></value>";
  -            assertTrue(buffer.toString().endsWith(postProlog));
  +            assertTrue("Unexpected results from writing of Object[]",
  +                       buffer.toString().endsWith(postProlog));
   
               Hashtable map = new Hashtable();
               map.put(foobar, thirtySeven);
  @@ -104,7 +145,8 @@
               postProlog += "<name>" + foobar + "</name>";
               postProlog += "<value><int>" + thirtySeven + "</int></value>";
               postProlog += "</member></struct></value>";
  -            assertTrue(buffer.toString().endsWith(postProlog));
  +            assertTrue("Unexpected results from writing of Hashtable",
  +                       buffer.toString().endsWith(postProlog));
           }
           catch (Exception e)
           {
  @@ -112,4 +154,16 @@
               fail(e.getMessage());
           }
       }
  +
  +    public void testWriteCharacterReference()
  +        throws Exception
  +    {
  +        writer = new XmlWriter(buffer, null);
  +        writer.hasWrittenProlog = true;
  +        writer.writeObject(String.valueOf((char) 0x80));
  +        writer.flush();
  +        String postProlog = "<value>&#128;</value>";
  +        assertTrue("Character reference not created as expected",
  +                   buffer.toString().endsWith(postProlog));
  +    }
   }
  
  
  

Reply via email to