This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit e1ddb7fbc8e1de7d776ca1861d1bca701d7cb90a Author: Andy Seaborne <a...@apache.org> AuthorDate: Thu Jun 26 19:07:06 2025 +0100 GH-3281: Update SPARQL/ARQ to check RDF Strings --- jena-arq/Grammar/arq.jj | 4 ++-- jena-arq/Grammar/main.jj | 4 ++-- jena-arq/Grammar/sparql_12.jj | 4 ++-- .../java/org/apache/jena/sparql/lang/QueryParserBase.java | 14 +++++++++++--- .../org/apache/jena/sparql/lang/arq/javacc/ARQParser.java | 6 +++--- .../jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java | 6 +++--- .../org/apache/jena/sparql/syntax/TestQueryParser.java | 2 +- 7 files changed, 24 insertions(+), 16 deletions(-) diff --git a/jena-arq/Grammar/arq.jj b/jena-arq/Grammar/arq.jj index 50d0d3c5cb..434bdb3a66 100644 --- a/jena-arq/Grammar/arq.jj +++ b/jena-arq/Grammar/arq.jj @@ -93,8 +93,8 @@ String VersionSpecifier() : { Token t ; String version ; } | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; } ) { - checkString(version, t.beginLine, t.beginColumn) ; version = unescapeStr(version, t.beginLine, t.beginColumn) ; + checkRDFString(version, t.beginLine, t.beginColumn) ; return version; } } @@ -1933,7 +1933,7 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; - checkString(lex, t.beginLine, t.beginColumn) ; + checkRDFString(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/Grammar/main.jj b/jena-arq/Grammar/main.jj index 52448abe4a..8512dedaee 100644 --- a/jena-arq/Grammar/main.jj +++ b/jena-arq/Grammar/main.jj @@ -188,8 +188,8 @@ String VersionSpecifier() : { Token t ; String version ; } | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; } ) { - checkString(version, t.beginLine, t.beginColumn) ; version = unescapeStr(version, t.beginLine, t.beginColumn) ; + checkRDFString(version, t.beginLine, t.beginColumn) ; return version; } } @@ -2631,7 +2631,7 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; - checkString(lex, t.beginLine, t.beginColumn) ; + checkRDFString(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/Grammar/sparql_12.jj b/jena-arq/Grammar/sparql_12.jj index 8404acc9c9..50c67ab5ec 100644 --- a/jena-arq/Grammar/sparql_12.jj +++ b/jena-arq/Grammar/sparql_12.jj @@ -83,8 +83,8 @@ String VersionSpecifier() : { Token t ; String version ; } | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; } ) { - checkString(version, t.beginLine, t.beginColumn) ; version = unescapeStr(version, t.beginLine, t.beginColumn) ; + checkRDFString(version, t.beginLine, t.beginColumn) ; return version; } } @@ -1646,7 +1646,7 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; - checkString(lex, t.beginLine, t.beginColumn) ; + checkRDFString(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java index 0c3d81d05a..e2b4449262 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java @@ -212,13 +212,21 @@ public class QueryParserBase { return NodeFactory.createLiteral(lex, lang, dt); } - // Because of Java (Java strings have surrogate pairs) we only detect singleton surrogates. - protected void checkString(String string, int line, int column) { + /** + * Apply any checks for "RDF String" to a string that has already had escape processing applied. + * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates. + * Because this is java, we test for no non-paired surrogates. + * A surrogate pair is high-low. + */ + protected static void checkRDFString(String string, int line, int column) { // Checks for bare surrogate pairs. for ( int i = 0; i < string.length(); i++ ) { // Not "codePointAt" which does surrogate processing. char ch = string.charAt(i); + if ( ! Character.isValidCodePoint(ch) ) + throw new QueryParseException(String.format("Illegal code point in \\U sequence value: 0x%08X", ch), line, column); + // Check surrogate pairs are in pairs. Pairs are high-low. if ( Character.isLowSurrogate(ch) ) throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column); @@ -369,7 +377,7 @@ public class QueryParserBase { iriStr = stripQuotes(iriStr); iriStr = unescapeUnicode(iriStr, line, column); // Check for Unicode surrogates - checkString(iriStr, line, column); + checkRDFString(iriStr, line, column); return resolveIRI(iriStr, line, column); } diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java index 7b7b044f61..3ebfd4bb56 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java @@ -149,8 +149,8 @@ version = stripQuotes(t.image) ; jj_consume_token(-1); throw new ParseException(); } -checkString(version, t.beginLine, t.beginColumn) ; - version = unescapeStr(version, t.beginLine, t.beginColumn) ; +version = unescapeStr(version, t.beginLine, t.beginColumn) ; + checkRDFString(version, t.beginLine, t.beginColumn) ; {if ("" != null) return version;} throw new Error("Missing return statement in function"); } @@ -7520,7 +7520,7 @@ lex = stripQuotes3(t.image) ; throw new ParseException(); } lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; - checkString(lex, t.beginLine, t.beginColumn) ; + checkRDFString(lex, t.beginLine, t.beginColumn) ; {if ("" != null) return lex ;} throw new Error("Missing return statement in function"); } diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java index 77876c05e2..8401b009c6 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java @@ -128,8 +128,8 @@ version = stripQuotes(t.image) ; jj_consume_token(-1); throw new ParseException(); } -checkString(version, t.beginLine, t.beginColumn) ; - version = unescapeStr(version, t.beginLine, t.beginColumn) ; +version = unescapeStr(version, t.beginLine, t.beginColumn) ; + checkRDFString(version, t.beginLine, t.beginColumn) ; {if ("" != null) return version;} throw new Error("Missing return statement in function"); } @@ -5889,7 +5889,7 @@ lex = stripQuotes3(t.image) ; throw new ParseException(); } lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; - checkString(lex, t.beginLine, t.beginColumn) ; + checkRDFString(lex, t.beginLine, t.beginColumn) ; {if ("" != null) return lex ;} throw new Error("Missing return statement in function"); } diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java index 6f447f9d61..acae80ff6e 100644 --- a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java +++ b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java @@ -70,7 +70,7 @@ public class TestQueryParser { @Test public void syntax_unicode_surrogate_pair_by_unicode_escape() { - // Allow - because Java strings may have surrogate pairs so we allow then in unicode escapes if paired. + // Allow - because Java strings may have surrogate pairs so we allow them in unicode escapes if paired. testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"); // QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"));