This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 76b92850626afba52b2a075761c162aa3ac41412 Author: Andy Seaborne <[email protected]> AuthorDate: Sun Oct 13 17:18:18 2024 +0100 GH-2766: Allow surrogates in IRIs as Java strings --- .../org/apache/jena/riot/tokens/TokenizerText.java | 26 ------------ .../apache/jena/riot/tokens/TestTokenizerText.java | 46 +++++++++++----------- .../main/java/org/apache/jena/irix/Chars3986.java | 10 ++++- 3 files changed, 30 insertions(+), 52 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java index 90767fe044..bc7de737c2 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java @@ -573,37 +573,11 @@ public final class TokenizerText implements Tokenizer default: if ( ch <= 0x19 ) warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString(), ch); - } - // JENA-1924: jena-iri does not catch this. - if ( ! VeryVeryLaxIRI && ch >= 0xA0 && ! isUcsChar(ch) ) - warning("Illegal character in IRI (Not a ucschar: 0x%04X): <%s[U+%04X]...>", ch, stringBuilder.toString(), ch); insertCodepoint(stringBuilder, ch); } } - private static boolean isUcsChar(int ch) { - // RFC 3987 - // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF - // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD - // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD - // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD - // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD - // / %xD0000-DFFFD / %xE1000-EFFFD - boolean b = range(ch, 0xA0, 0xD7FF) || range(ch, 0xF900, 0xFDCF) || range(ch, 0xFDF0, 0xFFEF); - if ( b ) - return true; - if ( ch < 0x1000 ) - return false; - // 32 bit checks. - return - range(ch, 0x10000, 0x1FFFD) || range(ch, 0x20000, 0x2FFFD) || range(ch, 0x30000, 0x3FFFD) || - range(ch, 0x40000, 0x4FFFD) || range(ch, 0x50000, 0x5FFFD) || range(ch, 0x60000, 0x6FFFD) || - range(ch, 0x70000, 0x7FFFD) || range(ch, 0x80000, 0x8FFFD) || range(ch, 0x90000, 0x9FFFD) || - range(ch, 0xA0000, 0xAFFFD) || range(ch, 0xB0000, 0xBFFFD) || range(ch, 0xC0000, 0xCFFFD) || - range(ch, 0xD0000, 0xDFFFD) || range(ch, 0xE1000, 0xEFFFD); - } - // Read a unicode escape : does not allow \\ bypass private final int readUnicodeEscape() { int ch = reader.readChar(); diff --git a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java index 4bd7751067..5ada5cc3b0 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java @@ -996,64 +996,62 @@ public class TestTokenizerText { }); } - @Test(expected=RiotParseException.class) - public void token_replacmentChar_uri_1() { + public void token_replacementChar_uri_1() { Tokenizer tokenizer = tokenizer("<a\uFFFDz>"); testNextToken(tokenizer, TokenType.IRI); } - @Test(expected=RiotParseException.class) - public void token_replacmentChar_uri_2() { + public void token_replacementChar_uri_2() { Tokenizer tokenizer = tokenizer("<a\\uFFFDz>"); testNextToken(tokenizer, TokenType.IRI); } @Test(expected=RiotParseException.class) - public void token_replacmentChar_bnode_1() { + public void token_replacementChar_bnode_1() { Tokenizer tokenizer = tokenizer("ns\uFFFD:xyz"); testNextToken(tokenizer, TokenType.PREFIXED_NAME); //assertFalse(tokenizer.hasNext()); } @Test(expected=RiotParseException.class) - public void token_replacmentChar_bnode_2() { + public void token_replacementChar_bnode_2() { Tokenizer tokenizer = tokenizer("ns:\uFFFDabc"); testNextToken(tokenizer, TokenType.PREFIXED_NAME); //assertFalse(tokenizer.hasNext()); } - private final int CountWaringsOnReplacmeentChar = 0; + private static final int CountWaringsOnReplacementChar = 0; // Test for warnings @Test - public void tokenStr_replacmentChar_str_1() { - testExpectWarning("'\uFFFD'", TokenType.STRING, CountWaringsOnReplacmeentChar); + public void tokenStr_replacementChar_str_1() { + testExpectWarning("'\uFFFD'", TokenType.STRING, CountWaringsOnReplacementChar); } @Test - public void tokenStr_replacmentChar_str_2() { + public void tokenStr_replacementChar_str_2() { // As unicode escape. testExpectWarning("'\\uFFFD'", TokenType.STRING, 0); } @Test - public void tokenStr_replacmentChar_str_3() { - testExpectWarning("'''\uFFFD'''", TokenType.STRING, CountWaringsOnReplacmeentChar); + public void tokenStr_replacementChar_str_3() { + testExpectWarning("'''\uFFFD'''", TokenType.STRING, CountWaringsOnReplacementChar); } @Test - public void tokenStr_replacmentChar_str_4() { + public void tokenStr_replacementChar_str_4() { // As unicode escape. testExpectWarning("'''\\uFFFD'''", TokenType.STRING, 0); } @Test - public void tokenStr_replacmentChar_str_5() { - testExpectWarning("'abc\uFFFDdef'", TokenType.STRING, CountWaringsOnReplacmeentChar); + public void tokenStr_replacementChar_str_5() { + testExpectWarning("'abc\uFFFDdef'", TokenType.STRING, CountWaringsOnReplacementChar); } @Test - public void tokenStr_replacmentChar_str_6() { + public void tokenStr_replacementChar_str_6() { // Illegal encoding. // 0xDF is ß (lower case) in ISO-8859-1. // Here it is an illegal encoding (high set, next byte should have the high bit set but does not). @@ -1063,35 +1061,35 @@ public class TestTokenizerText { byte[] bytes = {(byte)0x22, (byte)0xDF, (byte)0x22}; Reader r = IO.asUTF8(new ByteArrayInputStream(bytes)); PeekReader pr = PeekReader.make(r); - Token t = testExpectWarning(pr, TokenType.STRING, CountWaringsOnReplacmeentChar); + Token t = testExpectWarning(pr, TokenType.STRING, CountWaringsOnReplacementChar); int char0 = t.getImage().codePointAt(0); assertEquals("Expected Unicode REPLACEMENT CHARACTER", 0xFFFD, char0); } @Test - public void tokenStr_replacmentChar_IRI_1() { - testExpectWarning("<http://example/\uFFFD>", TokenType.IRI, 1); + public void tokenStr_replacementChar_IRI_1() { + testExpectWarning("<http://example/\uFFFD>", TokenType.IRI, 0); } @Test - public void tokenStr_replacmentChar_IRI_2() { + public void tokenStr_replacementChar_IRI_2() { // As unicode escape. Still bad in a URI. - testExpectWarning("<http://example/\\uFFFD>", TokenType.IRI, 1); + testExpectWarning("<http://example/\\uFFFD>", TokenType.IRI, 0); } @Test - public void tokenStr_replacmentChar_prefixedName_1() { + public void tokenStr_replacementChar_prefixedName_1() { testExpectWarning("ex:abc\uFFFD", TokenType.PREFIXED_NAME, 1); } @Test(expected=RiotException.class) - public void tokenStr_replacmentChar_prefixedName_2() { + public void tokenStr_replacementChar_prefixedName_2() { // Unicode escape testExpectWarning("ex:abc\\uFFFD", TokenType.PREFIXED_NAME, 0); } @Test - public void tokenStr_replacmentChar_blankNode_1() { + public void tokenStr_replacementChar_blankNode_1() { testExpectWarning("_:b\uFFFD", TokenType.BNODE, 1); // and no escaped characters for blank node labels. } diff --git a/jena-core/src/main/java/org/apache/jena/irix/Chars3986.java b/jena-core/src/main/java/org/apache/jena/irix/Chars3986.java index 2d6a71adc7..fd3914cd56 100644 --- a/jena-core/src/main/java/org/apache/jena/irix/Chars3986.java +++ b/jena-core/src/main/java/org/apache/jena/irix/Chars3986.java @@ -44,7 +44,7 @@ public class Chars3986 { /** RFC3987 ipchar */ public static boolean isIPChar(char ch, String str, int posn) { - return isPChar(ch, str, posn) || isUcsChar(ch); + return isPChar(ch, str, posn) || ch_isUcsChar(ch); } /** @@ -69,7 +69,7 @@ public class Chars3986 { /** RFC3987: International alphabetic. */ public static boolean isIAlpha(char ch) { - return isAlpha(ch) || isUcsChar(ch); + return isAlpha(ch) || ch_isUcsChar(ch); } // RFC 3987 @@ -83,7 +83,13 @@ public class Chars3986 { // Surrogates are "hi-lo" : DC000-DFFF and D800-DFFF // We assume the java string is valid and surrogates are correctly in high-low pairs. + /** @deprecated Prefer {@link #int_isUcsChar(int)} */ + @Deprecated(forRemoval = true) public static boolean isUcsChar(char ch) { + return ch_isUcsChar(ch); + } + + private static boolean ch_isUcsChar(char ch) { return range(ch, 0xA0, 0xD7FF) || range(ch, 0xF900, 0xFDCF) || range(ch, 0xFDF0, 0xFFEF) // Allow surrogates. || Character.isSurrogate(ch);
