mrglavas 2003/12/16 13:37:11 Modified: java/src/org/apache/xerces/impl XML11NSDocumentScannerImpl.java XML11DocumentScannerImpl.java XML11DTDScannerImpl.java XMLDocumentScannerImpl.java XML11EntityScanner.java XMLDocumentFragmentScannerImpl.java XMLScanner.java java/src/org/apache/xerces/util XML11Char.java Log: Fixing Bugs 24886, 25571, 25572 and 25573. 24886 - Names in XML 1.1 are now allowed to contain supplemental characters. Support for this was missing from the implementation. We need to check for surrogate character pairs in all names. Reorganized the order of checks in the dispatchers so that we do checks for regular name characters early and then as a last check, check for high surrogates of name chars. We should now be able to determine the well-formedness and validity of names containing supplemental characters. This should also fix some of the 1.1 support in DOM Level 3. 25571 - An internal buffer wasn't being cleared which caused attribute values containing supplemental characters to become corrupted. 25572 - Supplemental characters were being rejected in comments in XML 1.0 and 1.1 documents. 25573 - Root elements whose names start with any 1.1 NameStartChar were being rejected. We didn't have the proper hook in the base class for the 1.1 document scanner. Revision Changes Path 1.5 +7 -3 xml-xerces/java/src/org/apache/xerces/impl/XML11NSDocumentScannerImpl.java Index: XML11NSDocumentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XML11NSDocumentScannerImpl.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- XML11NSDocumentScannerImpl.java 22 Oct 2003 18:33:43 -0000 1.4 +++ XML11NSDocumentScannerImpl.java 16 Dec 2003 21:37:10 -0000 1.5 @@ -208,9 +208,13 @@ empty = true; break; } else if (!isValidNameStartChar(c) || !sawSpace) { - reportFatalError( - "ElementUnterminated", - new Object[] { rawname }); + // Second chance. 
Check if this character is a high + // surrogate of a valid name start character. + if (!isValidNameStartHighSurrogate(c) || !sawSpace) { + reportFatalError( + "ElementUnterminated", + new Object[] { rawname }); + } } // attributes 1.14 +10 -1 xml-xerces/java/src/org/apache/xerces/impl/XML11DocumentScannerImpl.java Index: XML11DocumentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XML11DocumentScannerImpl.java,v retrieving revision 1.13 retrieving revision 1.14 diff -u -r1.13 -r1.14 --- XML11DocumentScannerImpl.java 13 Nov 2003 18:45:59 -0000 1.13 +++ XML11DocumentScannerImpl.java 16 Dec 2003 21:37:10 -0000 1.14 @@ -359,6 +359,7 @@ } } else if (c != -1 && XMLChar.isHighSurrogate(c)) { + fStringBuffer3.clear(); if (scanSurrogates(fStringBuffer3)) { fStringBuffer.append(fStringBuffer3); if (entityDepth == fEntityDepth) { @@ -502,6 +503,14 @@ protected boolean isValidNameStartChar(int value) { return (XML11Char.isXML11NameStart(value)); } // isValidNameStartChar(int): boolean + + // returns true if the given character is + // a valid high surrogate for a nameStartChar + // with respect to the version of XML understood + // by this scanner. 
+ protected boolean isValidNameStartHighSurrogate(int value) { + return XML11Char.isXML11NameHighSurrogate(value); + } // isValidNameStartHighSurrogate(int): boolean protected boolean versionSupported(String version) { return (version.equals("1.1") || version.equals("1.0")); 1.8 +9 -1 xml-xerces/java/src/org/apache/xerces/impl/XML11DTDScannerImpl.java Index: XML11DTDScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XML11DTDScannerImpl.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- XML11DTDScannerImpl.java 25 Jul 2003 19:41:10 -0000 1.7 +++ XML11DTDScannerImpl.java 16 Dec 2003 21:37:10 -0000 1.8 @@ -223,6 +223,14 @@ protected boolean isValidNameStartChar(int value) { return (XML11Char.isXML11NameStart(value)); } // isValidNameStartChar(int): boolean + + // returns true if the given character is + // a valid high surrogate for a nameStartChar + // with respect to the version of XML understood + // by this scanner. + protected boolean isValidNameStartHighSurrogate(int value) { + return XML11Char.isXML11NameHighSurrogate(value); + } // isValidNameStartHighSurrogate(int): boolean // note that, according to 4.3.4 of the XML 1.1 spec, XML 1.1 // documents may invoke 1.0 entities; thus either version decl (or none!) 
1.35 +19 -8 xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java Index: XMLDocumentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java,v retrieving revision 1.34 retrieving revision 1.35 diff -u -r1.34 -r1.35 --- XMLDocumentScannerImpl.java 7 Nov 2003 00:26:17 -0000 1.34 +++ XMLDocumentScannerImpl.java 16 Dec 2003 21:37:10 -0000 1.35 @@ -757,11 +757,7 @@ } case SCANNER_STATE_START_OF_MARKUP: { fMarkupDepth++; - if (fEntityScanner.skipChar('?')) { - setScannerState(SCANNER_STATE_PI); - again = true; - } - else if (fEntityScanner.skipChar('!')) { + if (fEntityScanner.skipChar('!')) { if (fEntityScanner.skipChar('-')) { if (!fEntityScanner.skipChar('-')) { reportFatalError("InvalidCommentStart", @@ -779,7 +775,16 @@ null); } } - else if (XMLChar.isNameStart(fEntityScanner.peekChar())) { + else if (isValidNameStartChar(fEntityScanner.peekChar())) { + setScannerState(SCANNER_STATE_ROOT_ELEMENT); + setDispatcher(fContentDispatcher); + return true; + } + else if (fEntityScanner.skipChar('?')) { + setScannerState(SCANNER_STATE_PI); + again = true; + } + else if (isValidNameStartHighSurrogate(fEntityScanner.peekChar())) { setScannerState(SCANNER_STATE_ROOT_ELEMENT); setDispatcher(fContentDispatcher); return true; @@ -1142,7 +1147,13 @@ null); again = true; } - else if (XMLChar.isNameStart(fEntityScanner.peekChar())) { + else if (isValidNameStartChar(fEntityScanner.peekChar())) { + reportFatalError("MarkupNotRecognizedInMisc", + null); + scanStartElement(); + setScannerState(SCANNER_STATE_CONTENT); + } + else if (isValidNameStartHighSurrogate(fEntityScanner.peekChar())) { reportFatalError("MarkupNotRecognizedInMisc", null); scanStartElement(); 1.10 +374 -57 xml-xerces/java/src/org/apache/xerces/impl/XML11EntityScanner.java Index: XML11EntityScanner.java =================================================================== RCS file: 
/home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XML11EntityScanner.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- XML11EntityScanner.java 13 Nov 2003 18:45:59 -0000 1.9 +++ XML11EntityScanner.java 16 Dec 2003 21:37:10 -0000 1.10 @@ -183,26 +183,80 @@ // scan nmtoken int offset = fCurrentEntity.position; - while (XML11Char.isXML11Name(fCurrentEntity.ch[fCurrentEntity.position])) { - if (++fCurrentEntity.position == fCurrentEntity.count) { - int length = fCurrentEntity.position - offset; - if (length == fCurrentEntity.ch.length) { - // bad luck we have to resize our buffer - char[] tmp = new char[fCurrentEntity.ch.length << 1]; - System.arraycopy(fCurrentEntity.ch, offset, - tmp, 0, length); - fCurrentEntity.ch = tmp; - } - else { - System.arraycopy(fCurrentEntity.ch, offset, - fCurrentEntity.ch, 0, length); + + do { + char ch = fCurrentEntity.ch[fCurrentEntity.position]; + if (XML11Char.isXML11Name(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } } - offset = 0; - if (load(length, false)) { + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + 
offset = 0; + if (load(length, false)) { + --fCurrentEntity.position; + break; + } + } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11Name(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; break; } + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else { + break; } } + while (true); + int length = fCurrentEntity.position - offset; fCurrentEntity.columnNumber += length; @@ -239,9 +293,11 @@ // scan name int offset = fCurrentEntity.position; - if (XML11Char.isXML11NameStart(fCurrentEntity.ch[offset])) { + char ch = fCurrentEntity.ch[offset]; + + if (XML11Char.isXML11NameStart(ch)) { if (++fCurrentEntity.position == fCurrentEntity.count) { - fCurrentEntity.ch[0] = fCurrentEntity.ch[offset]; + fCurrentEntity.ch[0] = ch; offset = 0; if (load(1, false)) { fCurrentEntity.columnNumber++; @@ -249,7 +305,60 @@ return symbol; } } - while (XML11Char.isXML11Name(fCurrentEntity.ch[fCurrentEntity.position])) { + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + offset = 0; + if (load(1, false)) { + --fCurrentEntity.position; + return null; + } + } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11NameStart(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; + return null; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + 
fCurrentEntity.ch[1] = ch2; + offset = 0; + if (load(2, false)) { + fCurrentEntity.columnNumber += 2; + String symbol = fSymbolTable.addSymbol(fCurrentEntity.ch, 0, 2); + return symbol; + } + } + } + else { + return null; + } + + do { + ch = fCurrentEntity.ch[fCurrentEntity.position]; + if (XML11Char.isXML11Name(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { if (++fCurrentEntity.position == fCurrentEntity.count) { int length = fCurrentEntity.position - offset; if (length == fCurrentEntity.ch.length) { @@ -265,11 +374,41 @@ } offset = 0; if (load(length, false)) { + --fCurrentEntity.position; break; } } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11Name(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; + break; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else { + break; } } + while (true); + int length = fCurrentEntity.position - offset; fCurrentEntity.columnNumber += length; @@ -307,9 +446,11 @@ // scan name int offset = 
fCurrentEntity.position; - if (XML11Char.isXML11NCNameStart(fCurrentEntity.ch[offset])) { + char ch = fCurrentEntity.ch[offset]; + + if (XML11Char.isXML11NCNameStart(ch)) { if (++fCurrentEntity.position == fCurrentEntity.count) { - fCurrentEntity.ch[0] = fCurrentEntity.ch[offset]; + fCurrentEntity.ch[0] = ch; offset = 0; if (load(1, false)) { fCurrentEntity.columnNumber++; @@ -317,7 +458,60 @@ return symbol; } } - while (XML11Char.isXML11NCName(fCurrentEntity.ch[fCurrentEntity.position])) { + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + offset = 0; + if (load(1, false)) { + --fCurrentEntity.position; + return null; + } + } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11NCNameStart(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; + return null; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + fCurrentEntity.ch[1] = ch2; + offset = 0; + if (load(2, false)) { + fCurrentEntity.columnNumber += 2; + String symbol = fSymbolTable.addSymbol(fCurrentEntity.ch, 0, 2); + return symbol; + } + } + } + else { + return null; + } + + do { + ch = fCurrentEntity.ch[fCurrentEntity.position]; + if (XML11Char.isXML11NCName(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { if (++fCurrentEntity.position == fCurrentEntity.count) { int length = 
fCurrentEntity.position - offset; if (length == fCurrentEntity.ch.length) { @@ -333,11 +527,41 @@ } offset = 0; if (load(length, false)) { + --fCurrentEntity.position; break; } } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11NCName(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; + break; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else { + break; } } + while (true); + int length = fCurrentEntity.position - offset; fCurrentEntity.columnNumber += length; @@ -381,22 +605,57 @@ // scan qualified name int offset = fCurrentEntity.position; - if (XML11Char.isXML11NCNameStart(fCurrentEntity.ch[offset])) { + char ch = fCurrentEntity.ch[offset]; + + if (XML11Char.isXML11NCNameStart(ch)) { if (++fCurrentEntity.position == fCurrentEntity.count) { - fCurrentEntity.ch[0] = fCurrentEntity.ch[offset]; + fCurrentEntity.ch[0] = ch; offset = 0; if (load(1, false)) { fCurrentEntity.columnNumber++; - String name = - fSymbolTable.addSymbol(fCurrentEntity.ch, 0, 1); + String name = fSymbolTable.addSymbol(fCurrentEntity.ch, 0, 1); qname.setValues(null, name, name, null); return true; } } - int index = -1; - while (XML11Char.isXML11Name(fCurrentEntity.ch[fCurrentEntity.position])) { - char c = fCurrentEntity.ch[fCurrentEntity.position]; - if (c == ':') { + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + offset = 0; + if (load(1, false)) { + 
--fCurrentEntity.position; + return false; + } + } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11NCNameStart(XMLChar.supplemental(ch, ch2)) ) { + --fCurrentEntity.position; + return false; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + fCurrentEntity.ch[0] = ch; + fCurrentEntity.ch[1] = ch2; + offset = 0; + if (load(2, false)) { + fCurrentEntity.columnNumber += 2; + String name = fSymbolTable.addSymbol(fCurrentEntity.ch, 0, 2); + qname.setValues(null, name, name, null); + return true; + } + } + } + else { + return false; + } + + int index = -1; + boolean sawIncompleteSurrogatePair = false; + do { + ch = fCurrentEntity.ch[fCurrentEntity.position]; + if (XML11Char.isXML11Name(ch)) { + if (ch == ':') { if (index != -1) { break; } @@ -415,44 +674,102 @@ System.arraycopy(fCurrentEntity.ch, offset, fCurrentEntity.ch, 0, length); } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else if (XML11Char.isXML11NameHighSurrogate(ch)) { + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } if (index != -1) { - index -= offset; + index = index - offset; } offset = 0; if (load(length, false)) { + sawIncompleteSurrogatePair = true; + --fCurrentEntity.position; break; } } + char ch2 = fCurrentEntity.ch[fCurrentEntity.position]; + if ( !XMLChar.isLowSurrogate(ch2) || + !XML11Char.isXML11Name(XMLChar.supplemental(ch, ch2)) ) { + sawIncompleteSurrogatePair = true; + --fCurrentEntity.position; + break; + } + if (++fCurrentEntity.position == fCurrentEntity.count) { + int length = fCurrentEntity.position - 
offset; + if (length == fCurrentEntity.ch.length) { + // bad luck we have to resize our buffer + char[] tmp = new char[fCurrentEntity.ch.length << 1]; + System.arraycopy(fCurrentEntity.ch, offset, + tmp, 0, length); + fCurrentEntity.ch = tmp; + } + else { + System.arraycopy(fCurrentEntity.ch, offset, + fCurrentEntity.ch, 0, length); + } + if (index != -1) { + index = index - offset; + } + offset = 0; + if (load(length, false)) { + break; + } + } + } + else { + break; } - int length = fCurrentEntity.position - offset; - fCurrentEntity.columnNumber += length; - if (length > 0) { - String prefix = null; - String localpart = null; - String rawname = fSymbolTable.addSymbol(fCurrentEntity.ch, - offset, length); - if (index != -1) { - int prefixLength = index - offset; - prefix = fSymbolTable.addSymbol(fCurrentEntity.ch, + } + while (true); + + int length = fCurrentEntity.position - offset; + fCurrentEntity.columnNumber += length; + + if (length > 0) { + String prefix = null; + String localpart = null; + String rawname = fSymbolTable.addSymbol(fCurrentEntity.ch, + offset, length); + if (index != -1) { + int prefixLength = index - offset; + prefix = fSymbolTable.addSymbol(fCurrentEntity.ch, offset, prefixLength); - int len = length - prefixLength - 1; - int startLocal = index +1; - if (!XML11Char.isXML11NCNameStart(fCurrentEntity.ch[startLocal])){ - fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, - "IllegalQName", - null, - XMLErrorReporter.SEVERITY_FATAL_ERROR); - } - localpart = fSymbolTable.addSymbol(fCurrentEntity.ch, - index + 1, len); - + int len = length - prefixLength - 1; + int startLocal = index +1; + if (!XML11Char.isXML11NCNameStart(fCurrentEntity.ch[startLocal]) && + (!XML11Char.isXML11NameHighSurrogate(fCurrentEntity.ch[startLocal]) || + sawIncompleteSurrogatePair)){ + fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, + "IllegalQName", + null, + XMLErrorReporter.SEVERITY_FATAL_ERROR); } - else { - localpart = rawname; - } - 
qname.setValues(prefix, localpart, rawname, null); - return true; + localpart = fSymbolTable.addSymbol(fCurrentEntity.ch, + index + 1, len); + + } + else { + localpart = rawname; } + qname.setValues(prefix, localpart, rawname, null); + return true; } return false; 1.41 +22 -13 xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java Index: XMLDocumentFragmentScannerImpl.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java,v retrieving revision 1.40 retrieving revision 1.41 diff -u -r1.40 -r1.41 --- XMLDocumentFragmentScannerImpl.java 7 Nov 2003 00:26:17 -0000 1.40 +++ XMLDocumentFragmentScannerImpl.java 16 Dec 2003 21:37:10 -0000 1.41 @@ -780,7 +780,12 @@ break; } else if (!isValidNameStartChar(c) || !sawSpace) { - reportFatalError("ElementUnterminated", new Object[]{rawname}); + // Second chance. Check if this character is a high + // surrogate of a valid name start character. 
+ if (!isValidNameStartHighSurrogate(c) || !sawSpace) { + reportFatalError("ElementUnterminated", + new Object[] { rawname }); + } } // attributes @@ -1515,9 +1520,17 @@ } case SCANNER_STATE_START_OF_MARKUP: { fMarkupDepth++; - if (fEntityScanner.skipChar('?')) { - setScannerState(SCANNER_STATE_PI); - again = true; + if (fEntityScanner.skipChar('/')) { + if (scanEndElement() == 0) { + if (elementDepthIsZeroHook()) { + return true; + } + } + setScannerState(SCANNER_STATE_CONTENT); + } + else if (isValidNameStartChar(fEntityScanner.peekChar())) { + scanStartElement(); + setScannerState(SCANNER_STATE_CONTENT); } else if (fEntityScanner.skipChar('!')) { if (fEntityScanner.skipChar('-')) { @@ -1537,15 +1550,11 @@ null); } } - else if (fEntityScanner.skipChar('/')) { - if (scanEndElement() == 0) { - if (elementDepthIsZeroHook()) { - return true; - } - } - setScannerState(SCANNER_STATE_CONTENT); + else if (fEntityScanner.skipChar('?')) { + setScannerState(SCANNER_STATE_PI); + again = true; } - else if (isValidNameStartChar(fEntityScanner.peekChar())) { + else if (isValidNameStartHighSurrogate(fEntityScanner.peekChar())) { scanStartElement(); setScannerState(SCANNER_STATE_CONTENT); } 1.40 +11 -2 xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java Index: XMLScanner.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java,v retrieving revision 1.39 retrieving revision 1.40 diff -u -r1.39 -r1.40 --- XMLScanner.java 18 Nov 2003 18:17:32 -0000 1.39 +++ XMLScanner.java 16 Dec 2003 21:37:10 -0000 1.40 @@ -757,7 +757,7 @@ if (XMLChar.isHighSurrogate(c)) { scanSurrogates(text); } - if (isInvalidLiteral(c)) { + else if (isInvalidLiteral(c)) { reportFatalError("InvalidCharInComment", new Object[] { Integer.toHexString(c) }); fEntityScanner.scanChar(); @@ -951,6 +951,7 @@ } } else if (c != -1 && XMLChar.isHighSurrogate(c)) { + fStringBuffer3.clear(); if 
(scanSurrogates(fStringBuffer3)) { fStringBuffer.append(fStringBuffer3); if (entityDepth == fEntityDepth) { @@ -1353,6 +1354,14 @@ protected boolean isValidNameStartChar(int value) { return (XMLChar.isNameStart(value)); } // isValidNameStartChar(int): boolean + + // returns true if the given character is + // a valid high surrogate for a nameStartChar + // with respect to the version of XML understood + // by this scanner. + protected boolean isValidNameStartHighSurrogate(int value) { + return false; + } // isValidNameStartHighSurrogate(int): boolean protected boolean versionSupported(String version ) { return version.equals("1.0"); 1.5 +92 -25 xml-xerces/java/src/org/apache/xerces/util/XML11Char.java Index: XML11Char.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/XML11Char.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- XML11Char.java 18 Nov 2003 15:39:22 -0000 1.4 +++ XML11Char.java 16 Dec 2003 21:37:11 -0000 1.5 @@ -74,6 +74,7 @@ * @author Andy Clark, IBM * @author Arnaud Le Hors, IBM * @author Neil Graham, IBM + * @author Michael Glavassevich, IBM * * @version $Id$ */ @@ -327,6 +328,18 @@ return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME) != 0) || (0x10000 <= c && c < 0xF0000); } // isXML11NCName(int):boolean + + /** + * Returns whether the given character is a valid + * high surrogate for a name character. This includes + * all high surrogates for characters [0x10000-0xEFFFF]. + * In other words everything excluding planes 15 and 16. + * + * @param c The character to check. 
+ */ + public static boolean isXML11NameHighSurrogate(int c) { + return (0xD800 <= c && c <= 0xDB7F); + } /* * [5] Name ::= NameStartChar NameChar* @@ -339,16 +352,39 @@ * @return true if name is a valid Name */ public static boolean isXML11ValidName(String name) { - if (name.length() == 0) + int length = name.length(); + if (length == 0) return false; + int i = 1; char ch = name.charAt(0); - if( !isXML11NameStart(ch) ) - return false; - for (int i = 1; i < name.length(); i++ ) { - ch = name.charAt(i); - if( ! isXML11Name( ch ) ){ - return false; - } + if( !isXML11NameStart(ch) ) { + if ( length > 1 && isXML11NameHighSurrogate(ch) ) { + char ch2 = name.charAt(1); + if ( !XMLChar.isLowSurrogate(ch2) || + !isXML11NameStart(XMLChar.supplemental(ch, ch2)) ) { + return false; + } + i = 2; + } + else { + return false; + } + } + while (i < length) { + ch = name.charAt(i); + if ( !isXML11Name(ch) ) { + if ( ++i < length && isXML11NameHighSurrogate(ch) ) { + char ch2 = name.charAt(i); + if ( !XMLChar.isLowSurrogate(ch2) || + !isXML11Name(XMLChar.supplemental(ch, ch2)) ) { + return false; + } + } + else { + return false; + } + } + ++i; } return true; } // isXML11ValidName(String):boolean @@ -366,16 +402,39 @@ * @return true if name is a valid NCName */ public static boolean isXML11ValidNCName(String ncName) { - if (ncName.length() == 0) + int length = ncName.length(); + if (length == 0) return false; + int i = 1; char ch = ncName.charAt(0); - if( !isXML11NCNameStart(ch) ) - return false; - for (int i = 1; i < ncName.length(); i++ ) { - ch = ncName.charAt(i); - if( !isXML11NCName( ch ) ){ - return false; - } + if( !isXML11NCNameStart(ch) ) { + if ( length > 1 && isXML11NameHighSurrogate(ch) ) { + char ch2 = ncName.charAt(1); + if ( !XMLChar.isLowSurrogate(ch2) || + !isXML11NCNameStart(XMLChar.supplemental(ch, ch2)) ) { + return false; + } + i = 2; + } + else { + return false; + } + } + while (i < length) { + ch = ncName.charAt(i); + if ( !isXML11NCName(ch) ) { + if ( ++i < 
length && isXML11NameHighSurrogate(ch) ) { + char ch2 = ncName.charAt(i); + if ( !XMLChar.isLowSurrogate(ch2) || + !isXML11NCName(XMLChar.supplemental(ch, ch2)) ) { + return false; + } + } + else { + return false; + } + } + ++i; } return true; } // isXML11ValidNCName(String):boolean @@ -391,18 +450,26 @@ * @return true if nmtoken is a valid Nmtoken */ public static boolean isXML11ValidNmtoken(String nmtoken) { - if (nmtoken.length() == 0) + int length = nmtoken.length(); + if (length == 0) return false; - for (int i = 0; i < nmtoken.length(); i++ ) { - char ch = nmtoken.charAt(i); - if( ! isXML11Name( ch ) ){ - return false; - } + for (int i = 0; i < length; ++i ) { + char ch = nmtoken.charAt(i); + if( !isXML11Name(ch) ) { + if ( ++i < length && isXML11NameHighSurrogate(ch) ) { + char ch2 = nmtoken.charAt(i); + if ( !XMLChar.isLowSurrogate(ch2) || + !isXML11Name(XMLChar.supplemental(ch, ch2)) ) { + return false; + } + } + else { + return false; + } + } } return true; } // isXML11ValidName(String):boolean - - } // class XML11Char
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]