DO NOT REPLY TO THIS EMAIL, BUT PLEASE POST YOUR BUG 
RELATED COMMENTS THROUGH THE WEB INTERFACE AVAILABLE AT
<http://nagoya.apache.org/bugzilla/show_bug.cgi?id=16307>.
ANY REPLY MADE TO THIS MESSAGE WILL NOT BE COLLECTED AND 
INSERTED IN THE BUG DATABASE.

http://nagoya.apache.org/bugzilla/show_bug.cgi?id=16307

Invalid byte 1 of 1-byte UTF-8 sequence - error for seemingly valid unicode characters 
(UTF-8) ...

           Summary: Invalid byte 1 of 1-byte UTF-8 sequence - error for
                    seemingly valid unicode characters (UTF-8) ...
           Product: Xerces2-J
           Version: 2.1.0
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: Major
          Priority: Other
         Component: SAX
        AssignedTo: [EMAIL PROTECTED]
        ReportedBy: [EMAIL PROTECTED]


The following exception is thrown if the XML contains characters in the ranges 
\u0153 to \u02DD and \u2013 to \u2123.

Note: With a similar C++-based sample using Xerces-C 2.1, the same ranges 
work fine.

java.io.UTFDataFormatException: Invalid byte 1 of 1-byte UTF-8 sequence.
        at org.apache.xerces.impl.io.UTF8Reader.invalidByte(Unknown Source)
        at org.apache.xerces.impl.io.UTF8Reader.read(Unknown Source)
        at org.apache.xerces.impl.XMLEntityManager$EntityScanner.load(Unknown So
urce)
        at org.apache.xerces.impl.XMLEntityManager$EntityScanner.scanContent(Unk
nown Source)
        at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanContent(Unk
nown Source)
        at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContent
Dispatcher.dispatch(Unknown Source)
        at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Un
known Source)
        at org.apache.xerces.parsers.DTDConfiguration.parse(Unknown Source)
        at org.apache.xerces.parsers.DTDConfiguration.parse(Unknown Source)
        at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
        at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
        at SAX2Writer.print(SAX2Writer.java:58)
        at SAX2Writer.main(SAX2Writer.java:378)

I'm also including the SAX2Writer.java file, modified to test Unicode data 
parsing. In it I have marked the Unicode ranges that work and the ones that
don't. This can be tested by running the SAX2Writer class with the arguments
-u -V -S -F. Please let me know if you need any more information.

Note: I'm using JDK 1.3.1 on Win2K

Thanx,
Sandeep Desale

------------------- SAX2Writer.java starts here --------------------------

// FrontEnd Plus GUI for JAD
// DeCompiled : SAX2Writer.class

// package sax;

import java.io.*;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import sax.helpers.AttributesImpl;
import util.Arguments;

public class SAX2Writer extends DefaultHandler
{

    /** Fully qualified class name of the Xerces SAX parser implementation. */
    private static final String DEFAULT_PARSER_NAME 
= "org.apache.xerces.parsers.SAXParser";
    /**
     * Set by main() for the -u/-U option; when true, print() parses its
     * second argument as in-memory XML text instead of as a URI.
     */
    private static boolean unicodeTest = false;
    /** Value print() passes for the SAX "validation" feature. */
    private static boolean setValidation = false;
    /** Value print() passes for the SAX "namespaces" feature. */
    private static boolean setNameSpaces = true;
    /** Value print() passes for the Xerces "validation/schema" feature. */
    private static boolean setSchemaSupport = true;
    /** Value print() passes for the Xerces "schema-full-checking" feature. */
    private static boolean setSchemaFullSupport = false;
    /** Destination for the echoed document; the constructor wraps System.out. */
    protected PrintWriter out;
    /**
     * When true, startDocument() omits the XML declaration and normalize()
     * writes CR/LF as numeric character references.
     */
    protected boolean canonical;

    /**
     * Creates a writer that uses the default output encoding.
     *
     * @param flag true to produce canonical output
     * @throws UnsupportedEncodingException if the default encoding name is
     *         not supported by the runtime
     */
    public SAX2Writer(boolean flag) throws UnsupportedEncodingException
    {
        // A null encoding makes the two-argument constructor pick its default.
        this((String)null, flag);
    }

    protected SAX2Writer(String s, boolean flag)
        throws UnsupportedEncodingException
    {
        if(s == null)
            s = "UTF8";
        out = new PrintWriter(new OutputStreamWriter(System.out, s));
        canonical = flag;
    }

    public static void print(String s, String s1, boolean flag)
    {
        try
        {
            SAX2Writer sax2writer = new SAX2Writer(flag);
            XMLReader xmlreader = (XMLReader)Class.forName(s).newInstance();
            String s2 = "http://www.tibco.com/xmlns/ae2xsd/2002/05 s100.xsd 
http://www.tibco.com/xmlns/ae2xsd/2002/05/ae/test/cross/XPschema s1.xsd";
            System.out.println("Validation is " + setValidation);
            xmlreader.setFeature("http://xml.org/sax/features/validation";, 
setValidation);
            xmlreader.setFeature("http://xml.org/sax/features/namespaces";, 
setNameSpaces);
            xmlreader.setFeature
("http://apache.org/xml/features/validation/schema";, setSchemaSupport);
            xmlreader.setFeature
("http://apache.org/xml/features/validation/schema-full-checking";, 
setSchemaFullSupport);
            xmlreader.setProperty
("http://apache.org/xml/properties/schema/external-schemaLocation";, s2);
            xmlreader.setContentHandler(sax2writer);
            xmlreader.setErrorHandler(sax2writer);
            xmlreader.setEntityResolver(sax2writer);
            if(unicodeTest)
            {
                ByteArrayInputStream bytearrayinputstream = new 
ByteArrayInputStream(s1.getBytes());
                xmlreader.parse(new InputSource(bytearrayinputstream));
            } else
            {
                xmlreader.parse(s1);
            }
        }
        catch(Exception exception)
        {
            exception.printStackTrace(System.err);
        }
    }

    /**
     * Echoes a processing instruction to the output and flushes it.
     *
     * @param s  the processing instruction target
     * @param s1 the processing instruction data; skipped when null or empty
     */
    public void processingInstruction(String s, String s1)
    {
        boolean hasData = s1 != null && s1.length() > 0;

        out.print("<?");
        out.print(s);
        if(hasData)
            out.print(" " + s1);
        out.print("?>");
        out.flush();
    }

    /**
     * Writes the XML declaration at the start of the document, unless
     * canonical output was requested.
     */
    public void startDocument()
    {
        if(canonical)
            return;

        out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        out.flush();
    }

    /**
     * Logs the local name of an opening element and, when an attribute list
     * is supplied, the number of attributes it holds.
     *
     * @param s          namespace URI (not used)
     * @param s1         local name, which is what gets printed
     * @param s2         qualified name (not used)
     * @param attributes the element's attributes; may be null
     */
    public void startElement(String s, String s1, String s2, Attributes attributes)
    {
        System.out.println("Start Element name : " + s1);
        if(attributes == null)
            return;

        int count = attributes.getLength();
        System.out.println("Attribute count : " + count);
    }

    /**
     * Writes a run of character data to the output, escaped by normalize(),
     * and flushes.
     *
     * @param ac buffer holding the characters
     * @param i  index of the first character in the buffer
     * @param j  number of characters to write
     */
    public void characters(char ac[], int i, int j)
    {
        String text = new String(ac, i, j);
        String escaped = normalize(text);
        out.print(escaped);
        out.flush();
    }

    /**
     * Logs each ignorable whitespace character on its own line between
     * marker lines, then forwards the run to characters().
     *
     * @param ac buffer holding the whitespace
     * @param i  index of the first whitespace character in the buffer
     * @param j  number of whitespace characters (a length, not an end index)
     */
    public void ignorableWhitespace(char ac[], int i, int j)
    {
        System.out.println("Found ignorable white space*********<");
        // j is a length, so the run covers [i, i + j). Using j itself as the
        // upper bound would skip characters (or print none) whenever i > 0.
        int end = i + j;
        for(int k = i; k < end; k++)
            System.out.println(ac[k]);

        System.out.println(">***");
        characters(ac, i, j);
        out.flush();
    }

    /**
     * Logs the local name of a closing element.
     *
     * @param s  namespace URI (not used)
     * @param s1 local name, which is what gets printed
     * @param s2 qualified name (not used)
     */
    public void endElement(String s, String s1, String s2)
    {
        String line = "End Element name : " + s1;
        System.out.println(line);
    }

    /**
     * Reports a parser warning on System.err, prefixed with its location.
     *
     * @param saxparseexception the warning raised by the parser
     */
    public void warning(SAXParseException saxparseexception)
    {
        String location = getLocationString(saxparseexception);
        String message = saxparseexception.getMessage();
        System.err.println("[Warning] " + location + ": " + message);
    }

    public void error(SAXParseException saxparseexception)
        throws SAXException
    {
        System.err.println("[Error] " + getLocationString(saxparseexception) 
+ ": " + saxparseexception.getMessage());
        throw saxparseexception;
    }

    /**
     * Reports a fatal parser error on System.err, prefixed with its
     * location, then rethrows it.
     *
     * @param saxparseexception the fatal error raised by the parser
     * @throws SAXException always; the exception that was passed in
     */
    public void fatalError(SAXParseException saxparseexception) throws SAXException
    {
        String location = getLocationString(saxparseexception);
        String message = saxparseexception.getMessage();
        System.err.println("[Fatal Error] " + location + ": " + message);
        throw saxparseexception;
    }

    /**
     * Logs the system identifier of an external entity and resolves the
     * entity to an input source built from that identifier.
     *
     * @param s  public identifier of the entity (not used)
     * @param s1 system identifier of the entity
     * @return an input source created from the system identifier
     * @throws SAXException declared by the handler contract; not thrown by
     *         this implementation
     */
    public InputSource resolveEntity(String s, String s1) throws SAXException
    {
        InputSource source = new InputSource(s1);
        System.out.println("resolveEntity called systemid " + s1);
        return source;
    }

    /**
     * Builds a "file:line:column" label for a parse exception.
     *
     * Only the part of the system identifier after its last '/' is used.
     * When the exception has no system identifier the label starts with the
     * first ':'.
     *
     * @param saxparseexception the exception to describe
     * @return the location label
     */
    private String getLocationString(SAXParseException saxparseexception)
    {
        StringBuffer location = new StringBuffer();
        String systemId = saxparseexception.getSystemId();
        if(systemId != null)
        {
            int slash = systemId.lastIndexOf('/');
            String fileName = (slash == -1) ? systemId : systemId.substring(slash + 1);
            location.append(fileName);
        }
        location.append(':').append(saxparseexception.getLineNumber());
        location.append(':').append(saxparseexception.getColumnNumber());
        return location.toString();
    }

    /**
     * Escapes the XML markup characters in a string.
     *
     * The characters &lt; &gt; &amp; and the double quote are replaced by
     * their predefined entities. In canonical mode, line feed and carriage
     * return are written as decimal character references; otherwise they are
     * copied unchanged, like every other character.
     *
     * @param s the text to escape; null is treated as empty
     * @return the escaped text, never null
     */
    protected String normalize(String s)
    {
        StringBuffer escaped = new StringBuffer();
        if(s == null)
            return escaped.toString();

        for(int pos = 0, len = s.length(); pos < len; pos++)
        {
            char ch = s.charAt(pos);
            if(ch == '<')
                escaped.append("&lt;");
            else if(ch == '>')
                escaped.append("&gt;");
            else if(ch == '&')
                escaped.append("&amp;");
            else if(ch == '"')
                escaped.append("&quot;");
            else if(canonical && (ch == '\n' || ch == '\r'))
                escaped.append("&#").append(Integer.toString(ch)).append(';');
            else
                escaped.append(ch);
        }
        return escaped.toString();
    }

    /**
     * Returns a copy of an attribute list ordered by qualified name.
     *
     * Each attribute is inserted in front of the first already-placed
     * attribute whose qualified name compares greater, so attributes with
     * equal names keep their original relative order.
     *
     * @param attributes the attributes to sort; null yields an empty list
     * @return a new list holding the qualified name, type and value of every
     *         attribute, in sorted order
     */
    protected Attributes sortAttributes(Attributes attributes)
    {
        AttributesImpl sorted = new AttributesImpl();
        if(attributes == null)
            return sorted;

        int total = attributes.getLength();
        for(int src = 0; src < total; src++)
        {
            String qName = attributes.getQName(src);
            int placed = sorted.getLength();
            int dest = 0;
            // Advance past every entry that does not sort after qName.
            while(dest < placed && qName.compareTo(sorted.getQName(dest)) >= 0)
                dest++;

            sorted.insertAttributeAt(dest, qName, attributes.getType(src),
                attributes.getValue(src));
        }
        return sorted;
    }

    /**
     * Builds a buffer holding every character from c through c1, both ends
     * included.
     *
     * @param c  first character of the range
     * @param c1 last character of the range (inclusive)
     * @return a new buffer with the characters in ascending order; empty
     *         when c1 is smaller than c
     */
    public static StringBuffer appendRange(char c, char c1)
    {
        // A reversed range has nothing in it; guarding here also avoids
        // asking StringBuffer for a negative capacity.
        if(c1 < c)
            return new StringBuffer();

        // +1 because the end character belongs to the range.
        int count = c1 - c + 1;
        StringBuffer stringbuffer = new StringBuffer(count);
        for(int j = 0; j < count; j++)
            stringbuffer.append((char)(c + j));

        return stringbuffer;
    }

    public static String constructXML()
    {
        StringBuffer stringbuffer = new StringBuffer("<Monster><item>");
                                
                                // Fails for this range '\u0100' - '\u0800'
        StringBuffer stringbuffer1 = appendRange('\u0100', '\u0800');
                                
                                // Fails for this range '\u2000' - '\u27BF' 
                                // StringBuffer stringbuffer1 = appendRange
( '\u2000', '\u27BF' );         // Misc symbols
        

                                // Works '\u0900' - '\u1900' 
        // StringBuffer stringbuffer1 = appendRange( '\u0900', '\u1900' );      
        // too many to list...
        
                                // Works '\u1E00' - '\u1FFF'
                                // StringBuffer stringbuffer1 = appendRange
( '\u1E00', '\u1FFF' );         // Extended Latin, Extended Greek
        
                                // Works '\u2800' - '\u28FF' 
                                // StringBuffer stringbuffer1 = appendRange
( '\u2800', '\u28FF' );         // Braille
        
                                // Works '\u3040' - '\u30FF' 
                                // StringBuffer stringbuffer1 = appendRange
( '\u3040', '\u30FF' );         // Hiragana, Katakana
        
                                // Works '\u3200' - '\u51FF'
                                // StringBuffer stringbuffer1 = appendRange
( '\u3200', '\u51FF' );         // CJK letters and months, CJK Compatibility, 
CJK Unified, etc.
        
                                // Works '\u5200' - '\u71FF' 
                                // StringBuffer stringbuffer1 = appendRange
( '\u5200', '\u71FF' );         // CJK letters and months, CJK Compatibility, 
CJK Unified, etc.
        
                                // Works '\u7200' - '\u9FA5' 
                                // StringBuffer stringbuffer1 = appendRange
( '\u7200', '\u9FA5' );         // CJK letters and months, CJK Compatibility, 
CJK Unified, etc.
        
                                // Works '\uA000' - '\uA48C' 
                                // StringBuffer stringbuffer1 = appendRange
( '\uA000', '\uA48C' );         // Yi - omitted Yi Radicals...
        
                                // Works '\uAC00' - '\uD7A3' 
                                // StringBuffer stringbuffer1 = appendRange
( '\uAC00', '\uD7A3' );         // Hangul
        
                                // Works '\uF900' - '\uFA2D' 
                                // StringBuffer stringbuffer1 = appendRange
( '\uF900', '\uFA2D' );         // CJK Compatibility Ideographs        

        try
        {
            byte abyte0[] = stringbuffer1.toString().getBytes("UTF-8");
            stringbuffer.append(new String(abyte0, "UTF-8"));
            stringbuffer.append("</item></Monster>");
        }
        catch(UnsupportedEncodingException unsupportedencodingexception)
        {
            unsupportedencodingexception.printStackTrace();
        }
        return stringbuffer.toString();
    }

    public static void main(String args[])
    {
        Arguments arguments = new Arguments();
        arguments.setUsage(new String[] {
            "usage: java sax.SAX2Writer (options) 
(uri) ...", "", "options:", " -u | -U Unicode test", "  -n | -N  Turn on/off 
namespace [default=on]", "  -v | -V  Turn on/off validation [default=on]", "  -
s | -S  Turn on/off Schema support [default=on]", "  -f | -F  Turn on/off 
Schema full consraint checking  [default=off]", "  -c       Canonical XML 
output.", "  -h       This help screen."
        });
        if(args.length == 0)
        {
            arguments.printUsage();
            System.exit(1);
        }
        boolean flag = false;
        String s = "org.apache.xerces.parsers.SAXParser";
        arguments.parseArgumentTokens(args, new char[] {
            'p'
        });
        for(String s1 = null; (s1 = arguments.getlistFiles()) != null;)
        {
            int i;
label0:
            while((i = arguments.getArguments()) != -1) 
                switch(i)
                {
                case -1: 
                    break label0;

                case 99: // 'c'
                    flag = true;
                    break;

                case 67: // 'C'
                    flag = false;
                    break;

                case 118: // 'v'
                    setValidation = true;
                    break;

                case 86: // 'V'
                    setValidation = false;
                    break;

                case 78: // 'N'
                    setNameSpaces = false;
                    break;

                case 110: // 'n'
                    setNameSpaces = true;
                    break;

                case 112: // 'p'
                    s = arguments.getStringParameter();
                    break;

                case 115: // 's'
                    setSchemaSupport = true;
                    break;

                case 83: // 'S'
                    setSchemaSupport = false;
                    break;

                case 102: // 'f'
                    setSchemaFullSupport = true;
                    break;

                case 70: // 'F'
                    setSchemaFullSupport = false;
                    break;

                case 85: // 'U'
                case 117: // 'u'
                    String s2 = constructXML();
                    unicodeTest = true;
                    s1 = s2;
                    break;

                case 45: // '-'
                case 63: // '?'
                case 104: // 'h'
                    arguments.printUsage();
                    System.exit(1);
                    break;
                }
            System.err.println(s1 + ':');
            print(s, s1, flag);
        }

    }

}

------------------- SAX2Writer.java ends here --------------------------

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to