mrglavas 2004/06/15 14:36:39 Modified: java/src/org/apache/xerces/dom DOMNormalizer.java Log: The DOM normalizer was rejecting supplemental characters in
text, comments and CDATA sections. We should now be correctly handling surrogate character pairs. Revision Changes Path 1.56 +156 -100 xml-xerces/java/src/org/apache/xerces/dom/DOMNormalizer.java Index: DOMNormalizer.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/dom/DOMNormalizer.java,v retrieving revision 1.55 retrieving revision 1.56 diff -u -r1.55 -r1.56 --- DOMNormalizer.java 7 May 2004 21:35:35 -0000 1.55 +++ DOMNormalizer.java 15 Jun 2004 21:36:39 -0000 1.56 @@ -949,25 +949,34 @@ * @param isXML11Version = true if XML 1.1 */ public static final void isCDataWF(DOMErrorHandler errorHandler, DOMErrorImpl error, DOMLocatorImpl locator, - String datavalue, boolean isXML11Version) + String datavalue, boolean isXML11Version) { - if(datavalue == null || (datavalue.length() == 0) ) return ; - + if (datavalue == null || (datavalue.length() == 0) ) { + return; + } + char [] dataarray = datavalue.toCharArray(); - int datalength = dataarray.length ; - - //version of the document is XML 1.1 - if(isXML11Version){ - //we need to check all chracters as per production rules of XML11 - int i = 0 ; + int datalength = dataarray.length; + + // version of the document is XML 1.1 + if (isXML11Version) { + // we need to check all chracters as per production rules of XML11 + int i = 0; while(i < datalength){ char c = dataarray[i++]; - if(XML11Char.isXML11Invalid(c)){ - String msg = - DOMMessageFormatter.formatMessage( - DOMMessageFormatter.XML_DOMAIN, - "InvalidCharInCDSect", - new Object[] { Integer.toString(c, 16)}); + if ( XML11Char.isXML11Invalid(c) ) { + // check if this is a supplemental character + if (XMLChar.isHighSurrogate(c) && i < datalength) { + char c2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(c2) && + XMLChar.isSupplemental(XMLChar.supplemental(c, c2))) { + continue; + } + } + String msg = DOMMessageFormatter.formatMessage( + DOMMessageFormatter.XML_DOMAIN, + "InvalidCharInCDSect", + new Object[] { Integer.toString(c, 16)}); reportDOMError( errorHandler, error, @@ -976,54 +985,63 @@ DOMError.SEVERITY_ERROR, "wf-invalid-character"); } - else if (c==']'){ + else if (c == ']') { int count = i; - if (count<datalength && dataarray[count]==']'){ - while (++count <datalength && dataarray[count]==']'){ + if (count < datalength && dataarray[count] == ']') { + while (++count < datalength && dataarray[count] == ']') { // do nothing } - if (count <datalength && dataarray[count]=='>'){ - //CDEndInContent - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, - "CDEndInContent", null); - reportDOMError(errorHandler, error, locator,msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + if (count < datalength && dataarray[count] == '>') { + // CDEndInContent + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, "CDEndInContent", null); + reportDOMError(errorHandler, error, locator,msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } } } } - }//version of the document is XML 1.0 - else{ - //we need to check all chracters as per production rules of XML 1.0 - int i = 0 ; - while(i < datalength){ + } // version of the document is XML 1.0 + else { + // we need to check all chracters as per production rules of XML 1.0 + int i = 0; + while (i < datalength) { char c = dataarray[i++]; - if( XMLChar.isInvalid(c) ){ - //Note: The key InvalidCharInCDSect from XMLMessages.properties - //is being used to obtain the message and DOM error type - //"wf-invalid-character" is used. Also per DOM it is error but - //as per XML spec. it is fatal error - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, "InvalidCharInCDSect", new Object[]{Integer.toString(c, 16)}); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, - "wf-invalid-character"); + if( XMLChar.isInvalid(c) ) { + // check if this is a supplemental character + if (XMLChar.isHighSurrogate(c) && i < datalength) { + char c2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(c2) && + XMLChar.isSupplemental(XMLChar.supplemental(c, c2))) { + continue; + } + } + // Note: The key InvalidCharInCDSect from XMLMessages.properties + // is being used to obtain the message and DOM error type + // "wf-invalid-character" is used. Also per DOM it is error but + // as per XML spec. it is fatal error + String msg = DOMMessageFormatter.formatMessage( + DOMMessageFormatter.XML_DOMAIN, + "InvalidCharInCDSect", + new Object[]{Integer.toString(c, 16)}); + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } - else if (c==']'){ + else if (c==']') { int count = i; - if (count<datalength && dataarray[count]==']'){ - while (++count <datalength && dataarray[count]==']'){ + if ( count< datalength && dataarray[count]==']' ) { + while (++count < datalength && dataarray[count]==']' ) { // do nothing } - if (count <datalength && dataarray[count]=='>'){ - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN,"CDEndInContent", null); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + if ( count < datalength && dataarray[count]=='>' ) { + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, "CDEndInContent", null); + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } } } } - }//end-else fDocument.isXMLVersion() + } // end-else fDocument.isXMLVersion() - }//isCDataWF + } // isCDataWF /** * NON-DOM: check for valid XML characters as per the XML version @@ -1031,41 +1049,62 @@ * @param isXML11Version = true if XML 1.1 */ public static final void isXMLCharWF(DOMErrorHandler errorHandler, DOMErrorImpl error, DOMLocatorImpl locator, - String datavalue, boolean isXML11Version) + String datavalue, boolean isXML11Version) { - if(datavalue == null || (datavalue.length() == 0) ) return ; + if ( datavalue == null || (datavalue.length() == 0) ) { + return; + } + char [] dataarray = datavalue.toCharArray(); - int datalength = dataarray.length ; - - //version of the document is XML 1.1 + int datalength = dataarray.length; + + // version of the document is XML 1.1 if(isXML11Version){ //we need to check all characters as per production rules of XML11 int i = 0 ; - while(i < datalength){ + while (i < datalength) { if(XML11Char.isXML11Invalid(dataarray[i++])){ - String msg = DOMMessageFormatter.formatMessage( + // check if this is a supplemental character + char ch = dataarray[i-1]; + if (XMLChar.isHighSurrogate(ch) && i < datalength) { + char ch2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(ch2) && + XMLChar.isSupplemental(XMLChar.supplemental(ch, ch2))) { + continue; + } + } + String msg = DOMMessageFormatter.formatMessage( DOMMessageFormatter.DOM_DOMAIN, "InvalidXMLCharInDOM", new Object[]{Integer.toString(dataarray[i-1], 16)}); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, - "wf-invalid-character"); - }; + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, + "wf-invalid-character"); + } } - }//version of the document is XML 1.0 + } // version of the document is XML 1.0 else{ - //we need to check all characters as per production rules of XML 1.0 + // we need to check all characters as per production rules of XML 1.0 int i = 0 ; - while(i < datalength){ - if( XMLChar.isInvalid(dataarray[i++]) ){ - String msg = DOMMessageFormatter.formatMessage( + while (i < datalength) { + if( XMLChar.isInvalid(dataarray[i++]) ) { + // check if this is a supplemental character + char ch = dataarray[i-1]; + if (XMLChar.isHighSurrogate(ch) && i < datalength) { + char ch2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(ch2) && + XMLChar.isSupplemental(XMLChar.supplemental(ch, ch2))) { + continue; + } + } + String msg = DOMMessageFormatter.formatMessage( DOMMessageFormatter.DOM_DOMAIN, "InvalidXMLCharInDOM", new Object[]{Integer.toString(dataarray[i-1], 16)}); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, - "wf-invalid-character"); - }; + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, + "wf-invalid-character"); + } } - }//end-else fDocument.isXMLVersion() + } // end-else fDocument.isXMLVersion() - }//isXMLCharWF + } // isXMLCharWF /** * NON-DOM: check if value of the comment is well-formed @@ -1073,55 +1112,72 @@ * @param isXML11Version = true if XML 1.1 */ public static final void isCommentWF(DOMErrorHandler errorHandler, DOMErrorImpl error, DOMLocatorImpl locator, - String datavalue, boolean isXML11Version) + String datavalue, boolean isXML11Version) { - if(datavalue == null || (datavalue.length() == 0) ) return ; - + if ( datavalue == null || (datavalue.length() == 0) ) { + return; + } + char [] dataarray = datavalue.toCharArray(); int datalength = dataarray.length ; - - //version of the document is XML 1.1 - if(isXML11Version){ - //we need to check all chracters as per production rules of XML11 + + // version of the document is XML 1.1 + if (isXML11Version) { + // we need to check all chracters as per production rules of XML11 int i = 0 ; - while(i < datalength){ + while (i < datalength){ char c = dataarray[i++]; - - if(XML11Char.isXML11Invalid(c)){ - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, - "InvalidCharInComment", - new Object [] {Integer.toString(dataarray[i-1], 16)}); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); - } - else if (c == '-' && i<datalength && dataarray[i]=='-'){ - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, - "DashDashInComment", null); - // invalid: '--' in comment - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + if ( XML11Char.isXML11Invalid(c) ) { + // check if this is a supplemental character + if (XMLChar.isHighSurrogate(c) && i < datalength) { + char c2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(c2) && + XMLChar.isSupplemental(XMLChar.supplemental(c, c2))) { + continue; + } + } + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, + "InvalidCharInComment", + new Object [] {Integer.toString(dataarray[i-1], 16)}); + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + else if (c == '-' && i < datalength && dataarray[i] == '-') { + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, + "DashDashInComment", null); + // invalid: '--' in comment + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } } - }//version of the document is XML 1.0 - else{ - //we need to check all chracters as per production rules of XML 1.0 - int i = 0 ; - while(i < datalength){ + } // version of the document is XML 1.0 + else { + // we need to check all chracters as per production rules of XML 1.0 + int i = 0; + while (i < datalength){ char c = dataarray[i++]; if( XMLChar.isInvalid(c) ){ - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, - "InvalidCharInComment", new Object [] {Integer.toString(dataarray[i-1], 16)}); - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + // check if this is a supplemental character + if (XMLChar.isHighSurrogate(c) && i < datalength) { + char c2 = dataarray[i++]; + if (XMLChar.isLowSurrogate(c2) && + XMLChar.isSupplemental(XMLChar.supplemental(c, c2))) { + continue; + } + } + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, + "InvalidCharInComment", new Object [] {Integer.toString(dataarray[i-1], 16)}); + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } else if (c == '-' && i<datalength && dataarray[i]=='-'){ - String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, - "DashDashInComment", null); - // invalid: '--' in comment - reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); + String msg = DOMMessageFormatter.formatMessage(DOMMessageFormatter.XML_DOMAIN, + "DashDashInComment", null); + // invalid: '--' in comment + reportDOMError(errorHandler, error, locator, msg, DOMError.SEVERITY_ERROR, "wf-invalid-character"); } } - - }//end-else fDocument.isXMLVersion() + + } // end-else fDocument.isXMLVersion() - }//isCommentWF + } // isCommentWF /** NON-DOM: check if attribute value is well-formed * @param attributes --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]