This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit d954ed5a35a66ac68bacaef97c7e472bedb22e31 Author: Andy Seaborne <a...@apache.org> AuthorDate: Sat Jun 28 21:56:55 2025 +0100 GH-3281: TokenizerText inline surrogate checking --- .../org/apache/jena/riot/tokens/TokenizerText.java | 506 ++++++++++++++------- .../apache/jena/riot/tokens/TestTokenizerText.java | 132 +++++- 2 files changed, 458 insertions(+), 180 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java index 52547b2d50..3da31a4644 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java @@ -39,28 +39,108 @@ import org.apache.jena.sparql.ARQInternalErrorException; */ public final class TokenizerText implements Tokenizer { + // This class is performance critical. + + /* ==== Unicode and surrogates. + * + * == chars and ints + * + * Java's char is a code unit of UTF-16. Strictly, "code points" are after decoding + * from UTF-16 but for practical purposes a Java char is a code point, but a + * codepoint can also be more than 16 bits, and a codepoint is not a surrogate. + * + * A surrogate pair is a UTF-16 way to encode codepoints beyond U+FFFF. The + * first (high part of the value) surrogate, is a 16-bit code value in the range + * U+D800 to U+DBFF. The second (low) surrogate is a 16-bit code value in the + * range U+DC00 to U+DFFF. Together these encode U+010000 to U+10FFFF + * + * codepoint = (high - 0xD800)*0x400 + (low - 0xDC00) + 0x010000 + * + * UCS-2 is UTF-16 without surrogates, and so it is limited to U+0000 to U+FFFF. + * + * The code uses int when reading so that the non-codepoint 32bit -1 can be + * returned to indicate end of file. Casting from int to char (java bytecode + * "i2c") is silent truncation, no exception. + * + * Converting char to int is often implicit. 
An explicit cast may be necessary to + * call the right overloaded method/function, e.g. (int)ch for string formatting + * to %X (else char goes to Character object which is incompatible with %X). + * + * == RDF Strings + * + * RDF 1.2 introduces RDF String which is a sequence of Unicode Scalar Values + * after decoding. Scalar values are code points U+0000 to U+10FFFF except for + * surrogates. Surrogates, as pairs or alone, are illegal in RDF Strings. + * + * But Java strings are UTF-16 and use surrogate pairs. Jena needs to allow + * well-formed surrogate pairs in Java strings. + * + * This included bad surrogate pairs written as \-u and \-U escape sequences or + * the rare raw surrogate and Unicode escaped surrogate as a pair. + * + * Jena allows correctly used surrogate pairs because these can occur in the Java + * ecosystem. Jena rejects lone surrogates, or an adjacent pair of surrogates + * that are low-high (the wrong order). + * + * == Blank node labels + * + * Blank node labels exclude surrogates in the grammar (PN_CHARS_BASE) but allow + * code points above U+FFFF so Jena accepts valid surrogate pairs. + */ + + // -- Configuration + + // One of these should be true. + // Normally: + // CHECK_CODEPOINTS = true; + // CHECK_RDFSTRING = false; + + // Default=true. Whether to check for legal codepoint (i.e. only high-low surrogate pairs) while building strings. + private static final boolean CHECK_CODEPOINTS = true; + + // Default=false. Whether to check for legal RDF strings (no ill formed use of surrogates) after building strings. + private static final boolean CHECK_RDFSTRING = false; + + // Default=false. Allow some illegal characters in IRIs (probably causing rejection later when the IRI is parsed). + private static final boolean VeryVeryLaxIRI = false; + + // Default=false. Spaces in IRI are illegal. + private static final boolean AllowSpacesInIRI = false; + + // Controls related to raw use U+FFFD, the Unicode replacement character. 
+ // These are a sign that the input has been corrupted at some point, not + // necessarily the file being read but in the way it was created. + // They do occur in practice. + + // Replacement characters can occur in four places: + // * IRIs -- illegal IRI syntax + // * Strings -- in lexical forms. + // * Prefixed names -- illegal syntax but they end the token so cause a different token. + // * Blank node labels -- illegal syntax but they end the token so cause a different token. + // Note that when the input is ASCII, U+FFFD occurs for non-ASCII characters. + + private final static boolean WarnOnReplacmentCharInIRI = false; + private final static boolean WarnOnReplacmentCharInString = false; + private final static boolean WarnOnReplacmentCharInPrefixedName = true; + private final static boolean WarnOnReplacmentCharInBlankNodeLabel = true; + // The code has the call points for checking tokens but it is generally better to // do the check later in the parsing process. In case a need arises, the code // remains, all compiled away by "if ( false )" (javac does not generate any // bytecodes and even if it it did, JIT will remove dead branches). - private static final boolean CHECKING = false; + private static final boolean CHECKER = false; // Optional checker. private final TokenChecker checker = null; - // Whether to check for legal RDF strings (no ill formed use of surrogates) - private static final boolean CHECK_RDFSTRING = true; - - // Workspace for building token images. - // Reusing a StringBuilder is faster than allocating a fresh one each time. - private final StringBuilder stringBuilder = new StringBuilder(200); + // ---- + // Tokenizer state. // Character source private final PeekReader reader; // Whether whitespace between tokens includes newlines (in various forms). private final boolean singleLineMode; - // The code assumes that errors throw exception and so stop parsing. + // The code assumes that errors throw exceptions and so stop parsing. 
private final ErrorHandler errorHandler; - private Token token = null; private boolean finished = false; @@ -176,26 +256,27 @@ public final class TokenizerText implements Tokenizer // ---- IRI, unless it's << or <<( // [spc] check is for LT. if ( ch == CH_LT ) { - // Look ahead on char reader.readChar(); + // Look ahead on char int chPeek2 = reader.peekChar(); - if ( chPeek2 != '<' ) { + if ( chPeek2 != CH_LT ) { // '<' not '<<' token.setImage(readIRI()); token.setType(TokenType.IRI); - if ( CHECKING ) + if ( CHECKER ) checkURI(token.getImage()); return token; } reader.readChar(); // '<<' so far - maybe '<<(' int chPeek3 = reader.peekChar(); - if ( chPeek3 != '(' ) { + if ( chPeek3 != CH_LPAREN ) { + // Not '<<(' - it's '<<' token.setType(TokenType.LT2); //token.setImage("<<"); return token; } - // It is <<( + // It is '<<(' reader.readChar(); token.setType(TokenType.L_TRIPLE); //token.setImage("<<("); @@ -250,7 +331,7 @@ public final class TokenizerText implements Tokenizer mainToken.setSubToken1(token); mainToken.setImage2(langTag()); token = mainToken; - if ( CHECKING ) + if ( CHECKER ) checkLiteralLang(token.getImage(), token.getImage2()); } else if ( reader.peekChar() == '^' ) { expect("^^"); @@ -275,11 +356,11 @@ public final class TokenizerText implements Tokenizer mainToken.setType(TokenType.LITERAL_DT); token = mainToken; - if ( CHECKING ) + if ( CHECKER ) checkLiteralDT(token.getImage(), subToken); } else { // Was a simple string. 
- if ( CHECKING ) + if ( CHECKER ) checkString(token.getImage()); } return token; @@ -292,7 +373,7 @@ public final class TokenizerText implements Tokenizer reader.readChar(); token.setImage(readBlankNodeLabel()); token.setType(TokenType.BNODE); - if ( CHECKING ) checkBlankNode(token.getImage()); + if ( CHECKER ) checkBlankNode(token.getImage()); return token; } token.setType(TokenType.UNDERSCORE); @@ -305,7 +386,7 @@ public final class TokenizerText implements Tokenizer reader.readChar(); token.setType(TokenType.DIRECTIVE); token.setImage(readWord(false)); - if ( CHECKING ) + if ( CHECKER ) checkDirective(token.getImage()); return token; } @@ -316,7 +397,7 @@ public final class TokenizerText implements Tokenizer token.setType(TokenType.VAR); // Character set? token.setImage(readVarName()); - if ( CHECKING ) + if ( CHECKER ) checkVariable(token.getImage()); return token; } @@ -335,7 +416,7 @@ public final class TokenizerText implements Tokenizer reader.pushbackChar(CH_DOT); boolean charactersConsumed = readNumber(CH_ZERO, false); if ( charactersConsumed ) { - if ( CHECKING ) + if ( CHECKER ) checkNumber(token.getImage(), token.getImage2()); return token; } @@ -498,14 +579,14 @@ public final class TokenizerText implements Tokenizer if ( isNewlineChar(ch) ) { //** - If collecting token image. - //** stringBuilder.setLength(0); + //** resetStringBuilder(); // Any number of NL and CR become one "NL" token. do { int ch2 = reader.readChar(); // insertCodepointDirect(stringBuilder,ch2); } while (isNewlineChar(reader.peekChar())); token.setType(TokenType.NL); - //** token.setImage(stringBuilder.toString()); + //** token.setImage(currentString()); return token; } @@ -516,29 +597,143 @@ public final class TokenizerText implements Tokenizer readPrefixedNameOrKeyword(token); - if ( CHECKING ) checkKeyword(token.getImage()); + if ( CHECKER ) checkKeyword(token.getImage()); return token; } - private static final boolean VeryVeryLaxIRI = false; - // Spaces in IRI are illegal. 
- private static final boolean AllowSpacesInIRI = false; + // ==== Manage the stringBuilder + // Workspace for building token images. + // Reusing a StringBuilder is faster than allocating a fresh one each time. + // It should be possible to rename stringBuilder with no changes to the code anywhere outside these operations.s + private final StringBuilder stringBuilder = new StringBuilder(200); + + private static final int NO_CODEPOINT = '\u0000'; + + // -- Unicode sequences with the possibility of codepoints beyond U+FFFF + /** + * String with the possibility of a unicode surrogate or unicode escape. + * + * Pair with {@link #finishStringU(int)} + */ + private void startStringU() { + stringBuilder.setLength(0); + } + + /** + * Check terminates correctly and return string. + * Pair with {@link #startStringU()} + */ + private String finishStringU(int finalCodepoint) { + if ( finalCodepoint != NO_CODEPOINT ) + fatal("Bad unpaired surrogate at end of string"); + return stringBuilder.toString(); + } + + // -- Strings without possible unicode surrogates + + /** + * String with the possibility of a unicode surrogate or unicode escape. + * + * Pair with {@link #finishStringU(int)} + */ + private void startStringNU() { + stringBuilder.setLength(0); + } + + /** + * End processing a string. + * Pair with {@link #startStringU()} + */ + private String finishStringNU() { + return stringBuilder.toString(); + } + + private int lengthStringBuilder() { return stringBuilder.length(); } + private void setStringBuilderLength(int x) { stringBuilder.setLength(x); } + + private char charAt(int idx) { return stringBuilder.charAt(idx); } + private void deleteCharAt(int idx) { stringBuilder.deleteCharAt(idx); } + + /** Insert codepoint. 
*/ + private int insertCodepoint(int previousCP,int ch) { + if ( Character.charCount(ch) == 1 ) { + char ch16 = (char)ch; // Safe, not truncating, because count = 1 + char rtn = 0; + if ( CHECK_CODEPOINTS ) + rtn = checkCodepoint((char)previousCP, ch16); + insertCodepointDirect(ch16); + return rtn; + } else { + // Surrogate waiting? + if ( CHECK_CODEPOINTS && (previousCP != NO_CODEPOINT) ) + fatal("Lone surrogate"); + if ( !Character.isDefined(ch) && !Character.isSupplementaryCodePoint(ch) ) + fatal("Illegal codepoint: 0x%04X", ch); + // Only legal surrogate pairs at this point. + char[] chars = Character.toChars(ch); + stringBuilder.append(chars); + return NO_CODEPOINT; + } + } + +// // Only high then low is allowed. +// // Casting int to char is a 16 bit silent truncation (bytecode "i2c"). + private char checkCodepoint(char previousCP, char ch) { + if ( ! Character.isSurrogate(ch) ) { + if ( previousCP == NO_CODEPOINT ) + return NO_CODEPOINT; + fatal("Bad surrogate (high surrogate not followed by a low surrogate): 0x%04X", (int)previousCP); + } + // Surrogate. + if ( previousCP == NO_CODEPOINT ) { // Effectively: is previousCodePoint a high surrogate? + if ( Character.isHighSurrogate(ch) ) { + // Park it + return ch; + } + fatal("Bad surrogate (low surrogate not preceded by a high surrogate): 0x%04X", (int)ch); + } + // previousCodePoint != NO_CODEPOINT + // Previous is a high surrogate + + if ( Character.isLowSurrogate(ch) ) { + // high-low -- OK! Clear previous. + return NO_CODEPOINT; + } + fatal("Bad surrogate (high surrogate not followed by low surrogate): 0x%04X", (int)previousCP); + return NO_CODEPOINT; + } + + // Insert codepoint, knowing that 'ch' is 16 bit and not a surrogate. + private void insertCodepointDirect(int ch) { + insertCodepointDirect((char)ch); + } + + /** Insert codepoint, knowing that 'ch' is not a surrogate. 
*/ + private void insertCodepointDirect(char ch) { + stringBuilder.append(ch); + } + + /** Snapshot (unchecked) string builder - for error messages. */ + private String currentString() { return stringBuilder.toString(); } + + // ==== // [8] IRIREF ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' private String readIRI() { - stringBuilder.setLength(0); + startStringU(); + int prevCP = NO_CODEPOINT; for (;;) { int ch = reader.readChar(); switch(ch) { case EOF: fatal("Broken IRI (End of file)"); return null; case NL: - fatal("Broken IRI (newline): %s", stringBuilder.toString()); return null; + fatal("Broken IRI (newline): %s", currentString()); return null; case CR: - fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null; + fatal("Broken IRI (CR): %s", currentString()); return null; case CH_GT: // Done! - String str = stringBuilder.toString(); + String str = finishStringU(prevCP); if ( CHECK_RDFSTRING ) checkRDFString(str); return str; @@ -549,18 +744,18 @@ public final class TokenizerText implements Tokenizer break; case CH_LT: // Probably a corrupt file so treat as fatal. - fatal("Bad character in IRI (bad character: '<'): <%s[<]...>", stringBuilder.toString()); return null; + fatal("Bad character in IRI (bad character: '<'): <%s[<]...>", currentString()); return null; case TAB: - error("Bad character in IRI (tab character): <%s[tab]...>", stringBuilder.toString()); break; + error("Bad character in IRI (tab character): <%s[tab]...>", currentString()); break; case '{': case '}': case '"': case '|': case '^': case '`' : if ( ! VeryVeryLaxIRI ) - warning("Illegal character in IRI (codepoint U+%04X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch); + warning("Illegal character in IRI (codepoint U+%04X, '%c'): <%s[%c]...>", ch, (char)ch, currentString(), (char)ch); break; case SPC: if ( ! 
AllowSpacesInIRI ) - error("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString()); + error("Bad character in IRI (space): <%s[space]...>", currentString()); else - warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString()); + warning("Bad character in IRI (space): <%s[space]...>", currentString()); break; case REPLACEMENT: if ( WarnOnReplacmentCharInIRI ) @@ -568,9 +763,9 @@ public final class TokenizerText implements Tokenizer break; default: if ( ch <= 0x19 ) - warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString(), ch); + warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, currentString(), ch); } - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } } @@ -585,7 +780,7 @@ public final class TokenizerText implements Tokenizer token.setType(TokenType.PREFIXED_NAME); String ln = readLocalPart(); // Local part token.setImage2(ln); - if ( CHECKING ) + if ( CHECKER ) checkPrefixedName(token.getImage(), token.getImage2()); } @@ -594,7 +789,7 @@ public final class TokenizerText implements Tokenizer if ( posn == reader.getPosition() ) fatal("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch); - if ( CHECKING ) + if ( CHECKER ) checkKeyword(token.getImage()); } @@ -628,22 +823,6 @@ public final class TokenizerText implements Tokenizer return readSegment(true); } - // Controls related to raw use U+FFFD, the Unicode replacement character. - // These are a sign that the input has been corrupted at some point, not necessarily the file being read but also in the way it was created, - // They do occur in practice. - - // Replacement characters can occur in four places: - // * IRIs -- illegal IRI syntax - // * Strings -- in lexical forms. - // * Prefixed names -- illegal syntax but they end the token so cause a different token. - // * Blank node labels -- illegal syntax but they end the token so cause a different token. 
- // Note that when ASCII input, U+FFFD occurs for non-ASCII characters. - - private final static boolean WarnOnReplacmentCharInIRI = false; - private final static boolean WarnOnReplacmentCharInString = false; - private final static boolean WarnOnReplacmentCharInPrefixedName = true; - private final static boolean WarnOnReplacmentCharInBlankNodeLabel = true; - // Read the prefix or localname part of a prefixed name. // Returns "" when there are no valid characters, e.g. prefix for ":foo" or local name for "ex:". private String readSegment(boolean isLocalPart) { @@ -652,34 +831,43 @@ public final class TokenizerText implements Tokenizer // PN_CHARS_U is PN_CHARS_BASE and '_' // RiotChars has isPNChars_U_N for ( PN_CHARS_U | [0-9] ) - stringBuilder.setLength(0); + + int prevCP = NO_CODEPOINT; // -- Test first character int ch = reader.peekChar(); if ( ch == EOF ) return ""; + + startStringU(); + if ( isLocalPart ) { if ( ch == CH_COLON ) { reader.readChar(); - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } else if ( ch == CH_PERCENT || ch == CH_RSLASH ) { // processPLX // read % or \ reader.readChar(); processPLX(ch); + // prevCP = NO_CODEPOINT; } else if ( RiotChars.isPNChars_U_N(ch) ) { if ( WarnOnReplacmentCharInPrefixedName ) { if ( ch == REPLACEMENT ) warning("Unicode replacement character U+FFFD in prefixed name"); } - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); reader.readChar(); - } else + } else { + finishStringU(prevCP); return ""; + } } else { - if ( !RiotChars.isPNCharsBase(ch) ) + if ( !RiotChars.isPNCharsBase(ch) ) { + finishStringU(prevCP); return ""; - insertCodepoint(stringBuilder, ch); + } + prevCP = insertCodepoint(prevCP, ch); reader.readChar(); } // Done first character @@ -688,12 +876,12 @@ public final class TokenizerText implements Tokenizer for (;;) { ch = reader.peekChar(); boolean valid = false; - if ( isLocalPart && (ch == CH_PERCENT || ch == CH_RSLASH) ) { reader.readChar(); if 
( chDot != 0 ) - insertCodepointDirect(stringBuilder, chDot); + insertCodepointDirect(chDot); processPLX(ch); + prevCP = NO_CODEPOINT; chDot = 0; continue; } @@ -712,9 +900,9 @@ public final class TokenizerText implements Tokenizer break; // Exit loop // Valid character. - // Was there also a DOT previous loop? + // Was there also a DOT in the previous loop? if ( chDot != 0 ) { - insertCodepointDirect(stringBuilder, chDot); + insertCodepointDirect(chDot); chDot = 0; } @@ -723,7 +911,7 @@ public final class TokenizerText implements Tokenizer if ( ch == REPLACEMENT ) warning("Unicode replacement character U+FFFD in prefixed name"); } - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } else { // DOT - delay until next loop. chDot = ch; @@ -736,28 +924,27 @@ public final class TokenizerText implements Tokenizer if ( chDot == CH_DOT ) // Unread it. reader.pushbackChar(chDot); - return stringBuilder.toString(); + return finishStringU(prevCP); } // Process PLX (percent or character escape for a prefixed name) private void processPLX(int ch) { if ( ch == CH_PERCENT ) { - insertCodepointDirect(stringBuilder, ch); - + insertCodepointDirect(ch); ch = reader.peekChar(); if ( !isHexChar(ch) ) fatal("Not a hex character: '%c'", ch); - insertCodepointDirect(stringBuilder, ch); + insertCodepointDirect(ch); reader.readChar(); ch = reader.peekChar(); if ( !isHexChar(ch) ) fatal("Not a hex character: '%c'", ch); - insertCodepointDirect(stringBuilder, ch); + insertCodepointDirect(ch); reader.readChar(); } else if ( ch == CH_RSLASH ) { - ch = readCharEscape(); - insertCodepoint(stringBuilder, ch); + ch = readCharEscape(); // Does not allow Unicode escapes. + insertCodepointDirect(ch); } else throw new ARQInternalErrorException("Not a '\\' or a '%' character"); } @@ -775,19 +962,19 @@ public final class TokenizerText implements Tokenizer char ch = string.charAt(i); if ( ! 
Character.isValidCodePoint(ch) ) - warning("Illegal code point in \\U sequence value: 0x%08X", ch); + warning("Illegal code point in \\U sequence value: 0x%08X", (int)ch); // Check surrogate pairs are pairs. if ( Character.isHighSurrogate(ch) ) { i++; if ( i == string.length() ) - fatal("Bad surrogate pair (end of string)"); + fatal("Bad surrogate pair (end of string):0x%04X", (int)ch); char ch1 = string.charAt(i); if ( ! Character.isLowSurrogate(ch1) ) { - fatal("Bad surrogate pair (high surrogate not followed by low surrogate)"); + fatal("Bad surrogate (high surrogate not followed by a low surrogate): 0x%04X", (int)ch1); } } else if ( Character.isLowSurrogate(ch) ) { - fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate)"); + fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate): 0x%04X", (int)ch); } } } @@ -797,8 +984,9 @@ public final class TokenizerText implements Tokenizer private String readStringQuote1(int startCh, int endCh) { // Assumes the 1 character starting delimiter has been read. // Reads the terminating delimiter. - stringBuilder.setLength(0); + startStringU(); + int prevCP = NO_CODEPOINT; for (;;) { int ch = reader.readChar(); if ( WarnOnReplacmentCharInString ) { @@ -809,33 +997,34 @@ public final class TokenizerText implements Tokenizer if ( ch == NotACharacter || ch == ReverseOrderBOM ) warning("Unicode non-character U+%04X in string", ch); if ( ch == EOF ) - fatal("Broken token: %s", stringBuilder.toString()); + fatal("Broken token: %s", currentString()); else if ( ch == endCh ) { // Done! 
- String str = stringBuilder.toString(); + String str = finishStringU(prevCP); if ( CHECK_RDFSTRING ) checkRDFString(str); return str; } else if ( ch == NL ) - fatal("Broken token (newline in string)", stringBuilder.toString()); + fatal("Broken token (newline in string)", currentString()); else if ( ch == CR ) - fatal("Broken token (carriage return in string)", stringBuilder.toString()); + fatal("Broken token (carriage return in string)", currentString()); // Legal in Turtle/N-Triples - maybe warn? // else if ( ch == FF ) -// warning("Bad token (form feed in string)", stringBuilder.toString()); +// warning("Bad token (form feed in string)", currentString()); // else if ( ch == VT ) -// fatal("Bad token (vertical tab in string)", stringBuilder.toString()); +// fatal("Bad token (vertical tab in string)", currentString()); else if ( ch == CH_RSLASH ) // Allow escaped replacement character. ch = readLiteralEscape(); - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } } private String readStringQuote3(int quoteChar) { // Assumes the 3 character starting delimiter has been read. // Reads the terminating delimiter. - stringBuilder.setLength(0); + startStringU(); + int prevCP = NO_CODEPOINT; for (;;) { int ch = reader.readChar(); if ( WarnOnReplacmentCharInString ) { @@ -847,7 +1036,7 @@ public final class TokenizerText implements Tokenizer fatal("Broken long string"); } else if ( ch == quoteChar ) { if ( threeQuotes(quoteChar) ) { - String str = stringBuilder.toString(); + String str = finishStringU(prevCP); if ( CHECK_RDFSTRING ) checkRDFString(str); return str; @@ -855,12 +1044,13 @@ public final class TokenizerText implements Tokenizer // quote, not triple. It is a normal character. 
} else if ( ch == CH_RSLASH ) ch = readLiteralEscape(); - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } } - private String readWord(boolean leadingDigitAllowed) - { return readWordSub(leadingDigitAllowed, false); } + private String readWord(boolean leadingDigitAllowed) { + return readWordSub(leadingDigitAllowed, false); + } // A 'word' is used in several places: // keyword @@ -885,10 +1075,9 @@ public final class TokenizerText implements Tokenizer return readCharsWithExtras(true, true, extraCharsVar, true); } - // See also readBlankNodeLabel - private String readCharsWithExtras(boolean leadingDigitAllowed, boolean leadingSignAllowed, char[] extraChars, boolean allowFinalDot) { - stringBuilder.setLength(0); + // No unicode escapes. + startStringNU(); int idx = 0; if ( !leadingDigitAllowed ) { int ch = reader.peekChar(); @@ -908,7 +1097,7 @@ public final class TokenizerText implements Tokenizer if ( isAlphaNumeric(ch) || Chars.charInArray(ch, extraChars) ) { reader.readChar(); - insertCodepointDirect(stringBuilder, ch); + insertCodepointDirect(ch); continue; } else // Inappropriate character. @@ -919,20 +1108,22 @@ public final class TokenizerText implements Tokenizer if ( !allowFinalDot ) { // BAD : assumes pushbackChar is infinite. // Check is ends in "." - while (idx > 0 && stringBuilder.charAt(idx - 1) == CH_DOT) { + while (idx > 0 && charAt(idx - 1) == CH_DOT) { // Push back the dot. reader.pushbackChar(CH_DOT); - stringBuilder.setLength(idx - 1); idx--; + setStringBuilderLength(idx); } } - return stringBuilder.toString(); + return finishStringNU(); } // BLANK_NODE_LABEL ::= '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)? private String readBlankNodeLabel() { - stringBuilder.setLength(0); + startStringU(); + + int prevCP = NO_CODEPOINT; // First character. 
{ int ch = reader.peekChar(); @@ -948,7 +1139,7 @@ public final class TokenizerText implements Tokenizer if ( ch == REPLACEMENT ) warning("Unicode replacement character U+FFFD in blank node label"); } - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } // Remainder. DOT can't be last so do a delay on that. @@ -966,7 +1157,8 @@ public final class TokenizerText implements Tokenizer reader.readChar(); if ( chDot != 0 ) { - insertCodepointDirect(stringBuilder, chDot); + insertCodepointDirect(chDot); + prevCP = NO_CODEPOINT; chDot = 0; } @@ -976,7 +1168,7 @@ public final class TokenizerText implements Tokenizer if ( ch == REPLACEMENT ) warning("Unicode replacement character U+FFFD in blank node label"); } - insertCodepoint(stringBuilder, ch); + prevCP = insertCodepoint(prevCP, ch); } else // DOT - delay until next loop. chDot = ch; @@ -988,7 +1180,7 @@ public final class TokenizerText implements Tokenizer // if ( ! seen ) // exception("Blank node label missing"); - return stringBuilder.toString(); + return finishStringU(prevCP); } /* @@ -1020,18 +1212,18 @@ public final class TokenizerText implements Tokenizer int numDigitsBeforeDP = 0; int numDigitsAfterDP = 0; - stringBuilder.setLength(0); + startStringNU(); if ( initialChar != CH_ZERO ) { // char U+0000 if ( initialChar == CH_PLUS || initialChar == CH_MINUS ) - insertCodepointDirect(stringBuilder, initialChar); + insertCodepointDirect(initialChar); else if ( isDigit ) { - insertCodepointDirect(stringBuilder, initialChar); + insertCodepointDirect(initialChar); numDigitsBeforeDP = 1; } } int ch = reader.peekChar(); - numDigitsBeforeDP += readDigits(stringBuilder); + numDigitsBeforeDP += readDigits(); if ( numDigitsBeforeDP > 0 ) hasDigitsBeforeDot = true; @@ -1039,9 +1231,9 @@ public final class TokenizerText implements Tokenizer ch = reader.peekChar(); if ( ch == CH_DOT ) { reader.readChar(); - stringBuilder.append(CH_DOT); + insertCodepointDirect(CH_DOT); hasDecimalPoint = true; - 
numDigitsAfterDP += readDigits(stringBuilder); + numDigitsAfterDP += readDigits(); if ( numDigitsAfterDP > 0 ) hasDigitsAfterDot = true; } @@ -1053,32 +1245,26 @@ public final class TokenizerText implements Tokenizer if ( ! hasDigitsBeforeDot & ! hasDigitsAfterDot ) { // The number/significand/mantissa is exactly '.' - // Don't do anything - there might be a preceeding sign. + // Don't do anything - there might be a preceding sign. if ( hasDecimalPoint ) reader.pushbackChar(CH_DOT); return false; } - if ( exponent(stringBuilder) ) { + if ( exponent() ) { isDouble = true; } else { // Final part - "decimal" 123. is an integer 123 and a DOT. if ( hasDecimalPoint && ! hasDigitsAfterDot ) { - int N = stringBuilder.length(); - stringBuilder.deleteCharAt(N-1); // A DOT + int N = lengthStringBuilder(); // Reject the DOT which will be picked up next time. + deleteCharAt(N-1); reader.pushbackChar(CH_DOT); hasDecimalPoint = false; -// int len = stringBuilder.length(); -// if ( stringBuilder.charAt(len - 1) == CH_DOT ) { -// stringBuilder.setLength(len - 1); -// reader.pushbackChar(CH_DOT); -// hasDecimalPoint = false; -// } } } - token.setImage(stringBuilder.toString()); + token.setImage(finishStringNU()); if ( isDouble ) token.setType(TokenType.DOUBLE); else if ( hasDecimalPoint ) @@ -1095,49 +1281,49 @@ public final class TokenizerText implements Tokenizer return false; // It's HEX reader.readChar(); - stringBuilder.setLength(0); - insertCodepointDirect(stringBuilder, '0'); - insertCodepointDirect(stringBuilder, ch2); + startStringNU(); + insertCodepointDirect('0'); + insertCodepointDirect(ch2); // Error if no hex digits. 
- readHex(reader, stringBuilder); - token.setImage(stringBuilder.toString()); + readHex(reader); + token.setImage(finishStringNU()); token.setType(TokenType.HEX); return true; } - private void readHex(PeekReader reader, StringBuilder sb) { - // Just after the 0x, which are in sb + private void readHex(PeekReader reader) { + // Just after the 0x, which are in string builder. int x = 0; for (;;) { int ch = reader.peekChar(); if ( !isHexChar(ch) ) break; reader.readChar(); - insertCodepointDirect(sb, ch); + insertCodepointDirect(ch); x++; } if ( x == 0 ) - fatal("No hex characters after %s", sb.toString()); + fatal("No hex characters after %s", currentString()); } - private int readDigits(StringBuilder buffer) { + private int readDigits() { int count = 0; for (;;) { int ch = reader.peekChar(); if ( !range(ch, '0', '9') ) break; reader.readChar(); - insertCodepointDirect(buffer, ch); + insertCodepointDirect(ch); count++; } return count; } - private void readPossibleSign(StringBuilder sb) { + private void readPossibleSign() { int ch = reader.peekChar(); if ( ch == '-' || ch == '+' ) { reader.readChar(); - insertCodepointDirect(sb, ch); + insertCodepointDirect(ch); } } @@ -1166,23 +1352,23 @@ public final class TokenizerText implements Tokenizer return true; } - private boolean exponent(StringBuilder sb) { + private boolean exponent() { int ch = reader.peekChar(); if ( ch != 'e' && ch != 'E' ) return false; reader.readChar(); - insertCodepointDirect(sb, ch); - readPossibleSign(sb); - int x = readDigits(sb); + insertCodepointDirect(ch); + readPossibleSign(); + int x = readDigits(); if ( x == 0 ) - fatal("Malformed double: %s", sb); + fatal("Malformed double: %s", currentString()); return true; } private String langTag() { - stringBuilder.setLength(0); - a2z(stringBuilder); - if ( stringBuilder.length() == 0 ) + startStringU(); + a2z(); + if ( lengthStringBuilder() == 0 ) fatal("Bad language tag"); boolean seenTextDirection = false; @@ -1193,65 +1379,47 @@ public final 
class TokenizerText implements Tokenizer if ( seenTextDirection ) fatal("Bad language tag with base direction"); reader.readChar(); - insertCodepointDirect(stringBuilder, ch); + insertCodepointDirect(ch); int ch2 = reader.peekChar(); if ( ch2 == '-' ) { reader.readChar(); // base direction - insertCodepointDirect(stringBuilder, ch2); + insertCodepointDirect(ch2); seenTextDirection = true; } - int x = stringBuilder.length(); - a2zN(stringBuilder); - if ( stringBuilder.length() == x ) + int x = lengthStringBuilder(); + a2zN(); + if ( lengthStringBuilder() == x ) fatal("Bad language tag"); } else break; } - return stringBuilder.toString().intern(); + return finishStringU(NO_CODEPOINT).intern(); } // ASCII-only e.g. in lang tags. - private void a2z(StringBuilder sBuff) { + private void a2z() { for (;;) { int ch = reader.peekChar(); if ( isA2Z(ch) ) { reader.readChar(); - insertCodepointDirect(sBuff, ch); + insertCodepointDirect(ch); } else return; } } - private void a2zN(StringBuilder sBuff) { + private void a2zN() { for (;;) { int ch = reader.peekChar(); if ( isA2ZN(ch) ) { reader.readChar(); - insertCodepointDirect(sBuff, ch); + insertCodepointDirect(ch); } else return; } } - private void insertCodepoint(StringBuilder buffer, int ch) { - if ( Character.charCount(ch) == 1 ) - insertCodepointDirect(buffer, ch); - else { - // Convert to UTF-16. Note that the rest of any system this is used - // in must also respect codepoints and surrogate pairs. 
- if ( !Character.isDefined(ch) && !Character.isSupplementaryCodePoint(ch) ) - fatal("Illegal codepoint: 0x%04X", ch); - char[] chars = Character.toChars(ch); - buffer.append(chars); - } - } - - // Insert code point, knowing that 'ch' is 16 bit (basic plane) - private static void insertCodepointDirect(StringBuilder buffer, int ch) { - buffer.append((char)ch); - } - @Override public long getColumn() { return reader.getColNum(); diff --git a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java index bc2226a97a..d2fec64254 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java @@ -1356,46 +1356,140 @@ public class TestTokenizerText { // U+DC00-U+DFFF is a low surrogate (second part of a pair) // so D800-DC00 is legal. - @Test public void turtle_surrogate_pair_01() { + @Test public void turtle_surrogate_pair_esc_esc_01() { // escaped high, escaped low surrogate("'\\ud800\\udc00'"); } - @Test public void turtle_surrogate_pair_02() { + @Test public void turtle_surrogate_pair_esc_esc_02() { + // escaped high, escaped low + surrogate("'''\\ud800\\udc00'''"); + } + + @Test public void turtle_surrogate_pair_esc_esc_03() { + // escaped high, escaped low + surrogate("<\\ud800\\udc00>"); + } + + @Test public void turtle_surrogate_pair_esc_raw_01() { // escaped high, raw low surrogate("'\\ud800\udc00'"); } + @Test public void turtle_surrogate_pair_esc_raw_02() { + // escaped high, raw low + surrogate("'''\\ud800\udc00'''"); + } + @Test public void turtle_surrogate_pair_esc_raw_03() { + // escaped high, raw low + surrogate("<\\ud800\udc00>"); + } + @Test public void turtle_surrogate_pair_esc_raw_04() { + // escaped high, raw low + surrogate("_:b\ud800\udc00"); + } + // Compilation failure - illegal escape character -// @Test public void turtle_surrogate_pair_03() { +// @Test 
public void turtle_surrogate_pair_raw_esc_01() { // // raw high, escaped low // surrogate("'\ud800\\udc00'"); // } - @Test public void turtle_surrogate_pair_04() { + @Test public void turtle_surrogate_pair_raw_raw_01() { // raw high, raw low surrogate("'\ud800\udc00'"); } - @Test public void turtle_surrogate_pair_05() { + @Test public void turtle_surrogate_pair_raw_raw_02() { + // raw high, raw low + surrogate("'''\ud800\udc00'''"); + } + + @Test public void turtle_surrogate_pair_raw_raw_03() { + // raw high, raw low + surrogate("<\ud800\udc00>"); + } + + // Blank nodes label allow unicode but not unicode escapes. + @Test public void turtle_surrogate_pair_raw_raw_04() { + // raw high, raw low + surrogate("_:b\ud800\udc00"); + } + + @Test public void turtle_surrogate_pair_raw_raw_05() { + // escaped high, escaped low + surrogate("ns:\ud800\udc00"); + } + + @Test public void turtle_surrogate_pair_raw_raw_06() { + // escaped high, escaped low + surrogate("\ud800\udc00:local"); + } + + @Test public void turtle_surrogate_pair_esc_esc_internal_01() { // escaped high, escaped low surrogate("'a\\ud800\\udc00x'"); } - @Test public void turtle_surrogate_pair_06() { + @Test public void turtle_surrogate_pair_esc_esc_internal_02() { + // escaped high, escaped low + surrogate("'''a\\ud800\\udc00x'''"); + } + + @Test public void turtle_surrogate_pair_esc_esc_internal_03() { + // escaped high, escaped low + surrogate("<a\\ud800\\udc00x>"); + } + + @Test public void turtle_surrogate_pair_esc_raw_internal_01() { // escaped high, raw low - surrogate("'z\\ud800\udc00'z"); + surrogate("'z\\ud800\udc00z'"); + } + + @Test public void turtle_surrogate_pair_esc_raw_internal_02() { + // escaped high, raw low + surrogate("'''z\\ud800\udc00z'''"); + } + + @Test public void turtle_surrogate_pair_esc_raw_internal_03() { + // escaped high, raw low + surrogate("<z\\ud800\udc00z>"); } // Compilation failure - illegal escape character -// @Test public void turtle_surrogate_pair_07() { +// @Test 
public void turtle_surrogate_pair_raw_esc() { // // raw high, escaped low // surrogate("'a\ud800\\udc00'z"); // } - @Test public void turtle_surrogate_pair_08() { + @Test public void turtle_surrogate_pair_raw_raw_internal_01() { // raw high, raw low - surrogate("'a\ud800\udc00'z"); + surrogate("'a\ud800\udc00z'"); + } + + @Test public void turtle_surrogate_pair_raw_raw_internal_02() { + // raw high, raw low + surrogate("'''a\ud800\udc00z'''"); + } + + @Test public void turtle_surrogate_pair_raw_raw_internal_03() { + // raw high, raw low + surrogate("<a\ud800\udc00z>"); + } + + @Test public void turtle_surrogate_pair_raw_raw_internal_04() { + // raw high, raw low + surrogate("_:ba\ud800\udc00z"); + } + + @Test public void turtle_surrogate_pair_raw_raw__internal05() { + // escaped high, escaped low + surrogate("ns:x\ud800\udc00y"); + } + + @Test public void turtle_surrogate_pair_raw_raw__internal06() { + // escaped high, escaped low + surrogate("x\ud800\udc00y:local"); } @Test (expected=RiotParseException.class) @@ -1403,11 +1497,26 @@ public class TestTokenizerText { surrogate("'\\ud800'"); } + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_01a() { + surrogate("'''\\ud800''''"); + } + @Test (expected=RiotParseException.class) public void turtle_bad_surrogate_02() { surrogate("'a\\ud800z'"); } + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_02a() { + surrogate("'''a\\ud800z'''"); + } + + @Test (expected=RiotParseException.class) + public void turtle_bad_surrogate_02b() { + surrogate("<a\\ud800z>"); + } + @Test (expected=RiotParseException.class) public void turtle_bad_surrogate_03() { surrogate("'\\udfff'"); @@ -1477,8 +1586,9 @@ public class TestTokenizerText { private void surrogate(String string) { Tokenizer tokenizer = tokenizer(string); - tokenizer.hasNext(); + assertTrue(tokenizer.hasNext()); tokenizer.next(); + assertFalse(tokenizer.hasNext()); } @Test