This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit bea4aec856778986b49a2b74842eefcab3af5984 Author: Andy Seaborne <[email protected]> AuthorDate: Thu Feb 13 22:00:51 2025 +0000 GH-2979: Test for surrogates after escape processing --- jena-arq/Grammar/Final/README.txt | 15 ++++++++ jena-arq/Grammar/arq.jj | 4 +-- jena-arq/Grammar/main.jj | 3 +- jena-arq/Grammar/sparql_12.jj | 4 +-- .../apache/jena/sparql/lang/QueryParserBase.java | 25 +++++++------ .../org/apache/jena/sparql/lang/arq/ARQParser.java | 4 +-- .../jena/sparql/lang/sparql_12/SPARQLParser12.java | 4 +-- .../apache/jena/sparql/syntax/TestQueryParser.java | 42 ++++++++++++++++++++++ jena-cmds/src/test/java/arq/rdftests.java | 2 ++ 9 files changed, 83 insertions(+), 20 deletions(-) diff --git a/jena-arq/Grammar/Final/README.txt b/jena-arq/Grammar/Final/README.txt index 5308eb58ed..9d653ad57f 100644 --- a/jena-arq/Grammar/Final/README.txt +++ b/jena-arq/Grammar/Final/README.txt @@ -1 +1,16 @@ Final-for-spec-publication versions of the grammar. + +sparql_10-final.jj - SPARQL 1.0 "sparql_10.jj" ("main.jj" after cpp) + +sparql_11-final.jj - SPARQL 1.1 "sparql_11.jj" ("main.jj" after cpp) + +sparql-main-11.jj - SPARQL 1.1 "main.jj" (com.hp) + + +sparql_11-dev-final.jj - End SPARQL 1.1 development. (org.apache.jena.graph "main.jj" at SPARQLParser11) +sparql_11-dev-final.txt - jjdoc +tokens_11.txt - Tokens file. + +-- Coming soon. +sparql_12-final.jj - SPARQL 1.2 "sparql_11.jj" ("main.jj" after cpp) +sparql-main-12.jj - SPARQL 1.2 "main.jj" diff --git a/jena-arq/Grammar/arq.jj b/jena-arq/Grammar/arq.jj index fcab512796..cc471f1003 100644 --- a/jena-arq/Grammar/arq.jj +++ b/jena-arq/Grammar/arq.jj @@ -1910,8 +1910,8 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) - { checkString(lex, t.beginLine, t.beginColumn) ; - lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + checkString(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/Grammar/main.jj b/jena-arq/Grammar/main.jj index 19e2e62285..184dc198c8 100644 --- a/jena-arq/Grammar/main.jj +++ b/jena-arq/Grammar/main.jj @@ -2620,8 +2620,7 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) - { checkString(lex, t.beginLine, t.beginColumn) ; - lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + { lex = prepareLexicalForm(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/Grammar/sparql_12.jj b/jena-arq/Grammar/sparql_12.jj index 8154839f00..ba9e68a2dd 100644 --- a/jena-arq/Grammar/sparql_12.jj +++ b/jena-arq/Grammar/sparql_12.jj @@ -1625,8 +1625,8 @@ String String() : { Token t ; String lex ; } | t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; } | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; } ) - { checkString(lex, t.beginLine, t.beginColumn) ; - lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + checkString(lex, t.beginLine, t.beginColumn) ; return lex ; } } diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java index 006c9f8fda..d580837f6a 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java @@ -208,11 +208,17 @@ public class QueryParserBase { return NodeFactory.createLiteral(lex, lang, dt); } + // Because of Java (Java strings have surrogate pairs) we only detect singleton surrogates. protected void checkString(String string, int line, int column) { + // Checks for bare surrogate pairs. for ( int i = 0 ; i < string.length() ; i++ ) { // Not "codePointAt" which does surrogate processing. char ch = string.charAt(i); - // Check surrogate pairs are pairs. + + // Check surrogate pairs are in pairs. Pairs are high-low. + if ( Character.isLowSurrogate(ch) ) + throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column); + if ( Character.isHighSurrogate(ch) ) { i++; if ( i == string.length() ) @@ -221,8 +227,6 @@ public class QueryParserBase { if ( !Character.isLowSurrogate(ch1) ) { throw new QueryParseException("Bad surrogate pair (high surrogate not followed by low surrogate)", line, column); } - } else if ( Character.isLowSurrogate(ch) ) { - throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column); } } } @@ -345,7 +349,6 @@ public class QueryParserBase { protected Var createVariable(String s, int line, int column) { s = s.substring(1); // Drop the marker - // This is done by the parser input stream nowadays. // s = unescapeCodePoint(s, line, column); // Check \ u did not put in any illegals. @@ -361,6 +364,8 @@ public class QueryParserBase { protected String resolveQuotedIRI(String iriStr, int line, int column) { iriStr = stripQuotes(iriStr); iriStr = unescapeUnicode(iriStr, line, column); + // Check for Unicode surrogates + checkString(iriStr, line, column); return resolveIRI(iriStr, line, column); } @@ -634,12 +639,16 @@ public class QueryParserBase { // { return unescape(s, '\\', true, line, column); } // Do we need the line/column versions? - // Why not catch exceptions and comvert to QueryParseException - + // Why not catch exceptions and convert to QueryParseException protected static String unescapeStr(String s, int line, int column) { return unescape(s, '\\', false, line, column); } + /** Unescape unicode - no surrogate processing. */ + protected static String unescapeUnicode(String s, int line, int column) { + return unescape(s, '\\', true, line, column); + } + // Worker function protected static String unescape(String s, char escape, boolean pointCodeOnly, int line, int column) { try { @@ -650,10 +659,6 @@ public class QueryParserBase { } } - protected static String unescapeUnicode(String s, int line, int column) { - return unescape(s, '\\', true, line, column); - } - protected static String unescapePName(String s, int line, int column) { char escape = '\\'; int idx = s.indexOf(escape); diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/ARQParser.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/ARQParser.java index 53fce46895..5640f4ba91 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/ARQParser.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/ARQParser.java @@ -7442,8 +7442,8 @@ lex = stripQuotes3(t.image) ; jj_consume_token(-1); throw new ParseException(); } -checkString(lex, t.beginLine, t.beginColumn) ; - lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; +lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + checkString(lex, t.beginLine, t.beginColumn) ; {if ("" != null) return lex ;} throw new Error("Missing return statement in function"); } diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/SPARQLParser12.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/SPARQLParser12.java index 5bc894bc10..64270c70cd 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/SPARQLParser12.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/SPARQLParser12.java @@ -5837,8 +5837,8 @@ lex = stripQuotes3(t.image) ; jj_consume_token(-1); throw new ParseException(); } -checkString(lex, t.beginLine, t.beginColumn) ; - lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; +lex = unescapeStr(lex, t.beginLine, t.beginColumn) ; + checkString(lex, t.beginLine, t.beginColumn) ; {if ("" != null) return lex ;} throw new Error("Missing return statement in function"); } diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java index 4b93c8dcca..1f81373deb 100644 --- a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java +++ b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java @@ -19,12 +19,14 @@ package org.apache.jena.sparql.syntax; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.Test; import org.apache.jena.atlas.logging.LogCtl; import org.apache.jena.query.QueryFactory; import org.apache.jena.query.QueryParseException; +import static org.apache.jena.query.Syntax.*; import org.apache.jena.sparql.lang.QueryParserBase; import org.slf4j.Logger; @@ -40,6 +42,46 @@ public class TestQueryParser { LogCtl.withLevel(loggerSPARQL, "fatal", action); } + // Single backslash so a Java string escape, raw surrogate in the string. + @Test + public void syntax_unicode_raw_surrogate_uri() { + QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { <http://example/\uD800> ?p ?o}")); + assertTrue(ex.getMessage().contains("surrogate")); + } + + @Test + public void syntax_unicode_raw_surrogate_string() { + QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { ?s ?p '\uD800' }")); + assertTrue(ex.getMessage().contains("surrogate")); + } + + // Double backslash so the query string has an escape in it. + @Test + public void syntax_unicode_escaped_surrogate_uri() { + QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { <http://example/\\uD800> ?p ?o}")); + assertTrue(ex.getMessage().contains("surrogate")); + } + + @Test + public void syntax_unicode_escaped_surrogate_strings() { + QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { ?s ?p '\\uD800'}")); + assertTrue(ex.getMessage().contains("surrogate")); + } + + @Test + public void syntax_unicode_surrogate_pair_by_unicode_escape() { + // Allow - because Java strings may have surrogate pairs so we allow then in unicode escapes if paired. + testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"); + +// QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}")); +// assertTrue(ex.getMessage().contains("surrogate")); + } + + private static void testParse(String string) { + QueryFactory.create(string, syntaxSPARQL_12); + QueryFactory.create(string, syntaxARQ); + } + @Test public void syntax_uri_brackets_1() { testParseIRIs("<http://example/#[]>"); } diff --git a/jena-cmds/src/test/java/arq/rdftests.java b/jena-cmds/src/test/java/arq/rdftests.java index 8655942732..70f2246445 100644 --- a/jena-cmds/src/test/java/arq/rdftests.java +++ b/jena-cmds/src/test/java/arq/rdftests.java @@ -81,6 +81,8 @@ public class rdftests extends CmdGeneral RIOT.getContext().set(RIOT.symTurtleDirectiveStyle, "sparql"); } + // Test runners are in jena-arq, package org.apache.jena.arq.junit.runners + public static void main(String...argv) { try { new rdftests(argv).mainRun(); } catch (TerminationException ex) {
