This is an automated email from the ASF dual-hosted git repository.
andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git
The following commit(s) were added to refs/heads/main by this push:
new 8fac5edc0e GH-2826: Disallow raw CR in single quoted string
8fac5edc0e is described below
commit 8fac5edc0e7885c68891b14cf1156057ebda5226
Author: Andy Seaborne <[email protected]>
AuthorDate: Wed Nov 6 15:45:21 2024 +0000
GH-2826: Disallow raw CR in single quoted string
---
.../org/apache/jena/riot/tokens/TokenizerText.java | 25 ++++++----
.../apache/jena/riot/tokens/TestTokenizerText.java | 53 ++++++++++++++++++++++
.../main/java/org/apache/jena/atlas/lib/Chars.java | 26 +++++++----
3 files changed, 84 insertions(+), 20 deletions(-)
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index bc7de737c2..0f046bd583 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -215,7 +215,7 @@ public final class TokenizerText implements Tokenizer
int ch3 = reader.peekChar();
if ( ch3 == ch ) {
reader.readChar(); // Read potential third quote.
- token.setImage(readLongString(ch, false));
+ token.setImage(readStringQuote3(ch, false));
StringType st = (ch == CH_QUOTE1) ?
StringType.LONG_STRING1 : StringType.LONG_STRING2;
token.setStringType(st);
} else {
@@ -232,7 +232,7 @@ public final class TokenizerText implements Tokenizer
}
} else {
// One quote character.
- token.setImage(readString(ch, ch));
+ token.setImage(readStringQuote1(ch, ch));
// Record exactly what form of STRING was seen.
StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 :
StringType.STRING2;
token.setStringType(st);
@@ -788,7 +788,7 @@ public final class TokenizerText implements Tokenizer
// Get characters between two markers.
// strEscapes may be processed
- private String readString(int startCh, int endCh) {
+ private String readStringQuote1(int startCh, int endCh) {
// Position at start of string.
stringBuilder.setLength(0);
// Assumes first delimiter char read already.
@@ -802,15 +802,20 @@ public final class TokenizerText implements Tokenizer
warning("Unicode replacement character U+FFFD in string");
}
if ( ch == NotACharacter || ch == ReverseOrderBOM )
- warning("Unicode non-character U+%4X in string", ch);
- if ( ch == EOF ) {
- // if ( endNL ) return stringBuilder.toString();
+ warning("Unicode non-character U+%04X in string", ch);
+ if ( ch == EOF )
fatal("Broken token: %s", stringBuilder.toString());
- }
- else if ( ch == NL )
- fatal("Broken token (newline): %s", stringBuilder.toString());
else if ( ch == endCh )
return stringBuilder.toString();
+ else if ( ch == NL )
+ fatal("Broken token (newline in string)",
stringBuilder.toString());
+ else if ( ch == CR )
+ fatal("Broken token (carriage return in string)",
stringBuilder.toString());
+ // Legal in Turtle/N-Triples - maybe warn?
+// else if ( ch == FF )
+// warning("Bad token (form feed in string)",
stringBuilder.toString());
+// else if ( ch == VT )
+// fatal("Bad token (vertical tab in string)",
stringBuilder.toString());
else if ( ch == CH_RSLASH )
// Allow escaped replacement character.
ch = readLiteralEscape();
@@ -818,7 +823,7 @@ public final class TokenizerText implements Tokenizer
}
}
- private String readLongString(int quoteChar, boolean endNL) {
+ private String readStringQuote3(int quoteChar, boolean endNL) {
stringBuilder.setLength(0);
for (;;) {
int ch = reader.readChar();
diff --git
a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
index 5ada5cc3b0..b4c2865afb 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizerText.java
@@ -313,6 +313,38 @@ public class TestTokenizerText {
tokenizeAndTestExact("'\\U00000020'", StringType.STRING1, " ");
}
+ // Raw newline and carriage return not allowed in single quoted strings
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str12() {
+ tokenFirst("'abc\rdef'");
+ }
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str13() {
+ tokenFirst("'abc\ndef'");
+ }
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str14() {
+ tokenFirst("\"abc\rdef\"");
+ }
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str15() {
+ tokenFirst("\"abc\ndef\"");
+ }
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str16() {
+ tokenFirst("'\r'");
+ }
+
+ @Test(expected = RiotParseException.class)
+ public void tokenUnit_str17() {
+ tokenFirst("\"\n\"");
+ }
+
@Test
public void tokenUnit_str_long1() {
tokenizeAndTestExact("'''aaa'''", StringType.LONG_STRING1, "aaa");
@@ -368,6 +400,27 @@ public class TestTokenizerText {
tokenFirst("'''");
}
+ @Test
+ public void tokenUnit_str_long20() {
+ tokenizeAndTestExact("\"\"\"abc\ndef\"\"\"", StringType.LONG_STRING2,
"abc\ndef");
+ }
+
+ @Test
+ public void tokenUnit_str_long21() {
+ tokenizeAndTestExact("\"\"\"abc\rdef\"\"\"", StringType.LONG_STRING2,
"abc\rdef");
+ }
+
+ @Test
+ public void tokenUnit_str_long22() {
+ tokenizeAndTestExact("'''abc\ndef'''", StringType.LONG_STRING1,
"abc\ndef");
+ }
+
+ @Test
+ public void tokenUnit_str_long23() {
+ tokenizeAndTestExact("'''abc\rdef'''", StringType.LONG_STRING1,
"abc\rdef");
+ }
+
+
@Test
public void tokenUnit_str_long12() {
tokenizeAndTestExact("'''x'''@en", TokenType.LITERAL_LANG, "x", "en");
diff --git a/jena-base/src/main/java/org/apache/jena/atlas/lib/Chars.java
b/jena-base/src/main/java/org/apache/jena/atlas/lib/Chars.java
index e9db62c855..fde9bb8d11 100644
--- a/jena-base/src/main/java/org/apache/jena/atlas/lib/Chars.java
+++ b/jena-base/src/main/java/org/apache/jena/atlas/lib/Chars.java
@@ -177,20 +177,22 @@ public class Chars
// REPLACEMENT CHARACTER
public static final char REPLACEMENT = 0xFFFD ;
- // "Not a character" - a Unicode noncharacter codepoint that is not legal
in UTF-8.
+ // "Not a character" - a Unicode non-character codepoint that is not legal
in UTF-8.
public static final char NotACharacter = 0xFFFF ;
- // Detect byte order by contrast (BOM reversed) - a Unicode noncharacter
codepoint that is not legal in UTF-8.
+ // Detect byte order by contrast (BOM reversed) - a Unicode non-character
codepoint that is not legal in UTF-8.
public static final char ReverseOrderBOM = 0xFFFE ;
/** Undefined character (exact meaning depends on use) - not a Unicode
codepoint */
public static final int UNSET = -2 ;
- public static final char NL = '\n' ;
- public static final char LF = NL ; // Alt name.
- public static final char CR = '\r' ;
- public static final char TAB = '\t' ;
- public static final char FF = '\f' ; // Form feed
- public static final char SPC = ' ' ;
- public static final char BSPACE = '\b' ;
+ public static final char NL = '\n' ; // U+000A
+ public static final char LF = NL ; // U+000A -
alternative name
+ public static final char CR = '\r' ; // U+000D
+ public static final char TAB = '\t' ; // U+0009
+ public static final char FF = '\f' ; // U+000C - Form
feed
+ public static final char VT = '\u000B' ; // U+000B -
Vertical tab
+ public static final char SPC = ' ' ; // U+0020
+ public static final char BSPACE = '\b' ; // U+0008 -
Backspace
+ public static final char DEL = '\u007F' ; // U+007F - Delete
public static final char CH_ZERO = (char)0 ;
@@ -231,9 +233,13 @@ public class Chars
public static final char CH_VBAR = '|' ;
public static final char CH_TILDE = '~' ;
- // Byte versions of the above
+ // Byte versions of some of the above
public static final byte B_NL = NL ;
+ public static final byte B_LF = LF ;
public static final byte B_CR = CR ;
+ public static final byte B_FF = FF ;
+ public static final byte B_VT = VT ;
+ public static final byte B_SPC = SPC ;
public static final byte B_LBRACKET = '[' ;
public static final byte B_RBRACKET = ']' ;