This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git

commit e1ddb7fbc8e1de7d776ca1861d1bca701d7cb90a
Author: Andy Seaborne <a...@apache.org>
AuthorDate: Thu Jun 26 19:07:06 2025 +0100

    GH-3281: Update SPARQL/ARQ to check RDF Strings
---
 jena-arq/Grammar/arq.jj                                    |  4 ++--
 jena-arq/Grammar/main.jj                                   |  4 ++--
 jena-arq/Grammar/sparql_12.jj                              |  4 ++--
 .../java/org/apache/jena/sparql/lang/QueryParserBase.java  | 14 +++++++++++---
 .../org/apache/jena/sparql/lang/arq/javacc/ARQParser.java  |  6 +++---
 .../jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java  |  6 +++---
 .../org/apache/jena/sparql/syntax/TestQueryParser.java     |  2 +-
 7 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/jena-arq/Grammar/arq.jj b/jena-arq/Grammar/arq.jj
index 50d0d3c5cb..434bdb3a66 100644
--- a/jena-arq/Grammar/arq.jj
+++ b/jena-arq/Grammar/arq.jj
@@ -93,8 +93,8 @@ String VersionSpecifier() : { Token t ; String version ; }
     | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; }
     )
     {
-      checkString(version, t.beginLine, t.beginColumn) ;
       version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+      checkRDFString(version, t.beginLine, t.beginColumn) ;
       return version;
     }
 }
@@ -1933,7 +1933,7 @@ String String() : { Token t ; String lex ; }
   | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
   )
     { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
-      checkString(lex, t.beginLine, t.beginColumn) ;
+      checkRDFString(lex, t.beginLine, t.beginColumn) ;
       return lex ;
     }
 }
diff --git a/jena-arq/Grammar/main.jj b/jena-arq/Grammar/main.jj
index 52448abe4a..8512dedaee 100644
--- a/jena-arq/Grammar/main.jj
+++ b/jena-arq/Grammar/main.jj
@@ -188,8 +188,8 @@ String VersionSpecifier() : { Token t ; String version ; }
     | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; }
     )
     {
-      checkString(version, t.beginLine, t.beginColumn) ;
       version = unescapeStr(version,  t.beginLine, t.beginColumn) ;
+      checkRDFString(version,  t.beginLine, t.beginColumn) ;
       return version;
     }
 }
@@ -2631,7 +2631,7 @@ String String() : { Token t ; String lex ; }
   | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
   )
     { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
-      checkString(lex, t.beginLine, t.beginColumn) ;
+      checkRDFString(lex,  t.beginLine, t.beginColumn) ;
       return lex ;
     }
 }
diff --git a/jena-arq/Grammar/sparql_12.jj b/jena-arq/Grammar/sparql_12.jj
index 8404acc9c9..50c67ab5ec 100644
--- a/jena-arq/Grammar/sparql_12.jj
+++ b/jena-arq/Grammar/sparql_12.jj
@@ -83,8 +83,8 @@ String VersionSpecifier() : { Token t ; String version ; }
     | t = <STRING_LITERAL2> { version = stripQuotes(t.image) ; }
     )
     {
-      checkString(version, t.beginLine, t.beginColumn) ;
       version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+      checkRDFString(version, t.beginLine, t.beginColumn) ;
       return version;
     }
 }
@@ -1646,7 +1646,7 @@ String String() : { Token t ; String lex ; }
   | t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
   )
     { lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
-      checkString(lex, t.beginLine, t.beginColumn) ;
+      checkRDFString(lex, t.beginLine, t.beginColumn) ;
       return lex ;
     }
 }
diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java
index 0c3d81d05a..e2b4449262 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java
@@ -212,13 +212,21 @@ public class QueryParserBase {
         return NodeFactory.createLiteral(lex, lang, dt);
     }
 
-    // Because of Java (Java strings have surrogate pairs) we only detect singleton surrogates.
-    protected void checkString(String string, int line, int column) {
+    /**
+     * Apply any checks for "RDF String" to a string that has already had escape processing applied.
+     * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates.
+     * Because Java strings may contain surrogate pairs, we only check that there are no unpaired surrogates.
+     * A surrogate pair is a high surrogate followed by a low surrogate.
+     */
+    protected static void checkRDFString(String string, int line, int column) {
         // Checks for bare (unpaired) surrogates.
         for ( int i = 0; i < string.length(); i++ ) {
             // Not "codePointAt" which does surrogate processing.
             char ch = string.charAt(i);
 
+            if ( ! Character.isValidCodePoint(ch) )
+                throw new QueryParseException(String.format("Illegal code 
point in \\U sequence value: 0x%08X", ch), line, column);
+
             // Check surrogate pairs are in pairs. Pairs are high-low.
             if ( Character.isLowSurrogate(ch) )
                 throw new QueryParseException("Bad surrogate pair (low 
surrogate without high surrogate)", line, column);
@@ -369,7 +377,7 @@ public class QueryParserBase {
         iriStr = stripQuotes(iriStr);
         iriStr = unescapeUnicode(iriStr, line, column);
         // Check for Unicode surrogates
-        checkString(iriStr, line, column);
+        checkRDFString(iriStr, line, column);
         return resolveIRI(iriStr, line, column);
     }
 
diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java
index 7b7b044f61..3ebfd4bb56 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/arq/javacc/ARQParser.java
@@ -149,8 +149,8 @@ version = stripQuotes(t.image) ;
       jj_consume_token(-1);
       throw new ParseException();
     }
-checkString(version, t.beginLine, t.beginColumn) ;
-      version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+      checkRDFString(version, t.beginLine, t.beginColumn) ;
       {if ("" != null) return version;}
     throw new Error("Missing return statement in function");
 }
@@ -7520,7 +7520,7 @@ lex = stripQuotes3(t.image) ;
       throw new ParseException();
     }
 lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
-      checkString(lex, t.beginLine, t.beginColumn) ;
+      checkRDFString(lex, t.beginLine, t.beginColumn) ;
       {if ("" != null) return lex ;}
     throw new Error("Missing return statement in function");
 }
diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java
index 77876c05e2..8401b009c6 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/lang/sparql_12/javacc/SPARQLParser12.java
@@ -128,8 +128,8 @@ version = stripQuotes(t.image) ;
       jj_consume_token(-1);
       throw new ParseException();
     }
-checkString(version, t.beginLine, t.beginColumn) ;
-      version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+version = unescapeStr(version, t.beginLine, t.beginColumn) ;
+      checkRDFString(version, t.beginLine, t.beginColumn) ;
       {if ("" != null) return version;}
     throw new Error("Missing return statement in function");
 }
@@ -5889,7 +5889,7 @@ lex = stripQuotes3(t.image) ;
       throw new ParseException();
     }
 lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
-      checkString(lex, t.beginLine, t.beginColumn) ;
+      checkRDFString(lex, t.beginLine, t.beginColumn) ;
       {if ("" != null) return lex ;}
     throw new Error("Missing return statement in function");
 }
diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java
index 6f447f9d61..acae80ff6e 100644
--- a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java
+++ b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java
@@ -70,7 +70,7 @@ public class TestQueryParser {
 
     @Test
     public void syntax_unicode_surrogate_pair_by_unicode_escape() {
-        // Allow - because Java strings may have surrogate pairs so we allow then in unicode escapes if paired.
+        // Allow - because Java strings may have surrogate pairs so we allow them in unicode escapes if paired.
         testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}");
 
 //        QueryParseException ex = assertThrows(QueryParseException.class,  
()->testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"));

Reply via email to