Repository: jena
Updated Branches:
refs/heads/master a5d3d915f -> 0cac294ec
JENA-911 : Strict RDF 1.1 URI tokens (no space, {, } ... in URI)
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/0cac294e
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/0cac294e
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/0cac294e
Branch: refs/heads/master
Commit: 0cac294ece3aba6ff73e9d960499bf968bdf9ab4
Parents: a5d3d91
Author: Andy Seaborne <[email protected]>
Authored: Mon Apr 27 19:36:28 2015 +0100
Committer: Andy Seaborne <[email protected]>
Committed: Mon Apr 27 19:36:28 2015 +0100
----------------------------------------------------------------------
.../java/org/apache/jena/atlas/lib/Chars.java | 2 +
.../org/apache/jena/riot/tokens/Tokenizer.java | 1 -
.../jena/riot/tokens/TokenizerFactory.java | 40 +++++++-------
.../apache/jena/riot/tokens/TokenizerText.java | 56 ++++++++++----------
.../apache/jena/riot/lang/TestLangNTuples.java | 16 +++---
.../org/apache/jena/riot/lang/TestLangTrig.java | 10 ++--
.../apache/jena/riot/lang/TestLangTurtle.java | 33 +++++-------
.../apache/jena/riot/tokens/TestTokenizer.java | 47 ++++++++++++++++
8 files changed, 125 insertions(+), 80 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
b/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
index ba301f8..38f4f88 100644
--- a/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
+++ b/jena-arq/src/main/java/org/apache/jena/atlas/lib/Chars.java
@@ -183,6 +183,8 @@ public class Chars
public static final int UNSET = -2 ;
public static final char NL = '\n' ;
public static final char CR = '\r' ;
+ public static final char TAB = '\t' ;
+ public static final char SPC = ' ' ;
public static final char BSPACE = '\b' ;
public static final char CH_ZERO = (char)0 ;
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
index b5ffa00..b1cd5e6 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Tokenizer.java
@@ -22,7 +22,6 @@ import java.util.Iterator ;
import org.apache.jena.atlas.lib.Closeable ;
-
public interface Tokenizer extends Iterator<Token>, Closeable
{
/** Is there another token? */
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
----------------------------------------------------------------------
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
index 3571c84..886bdbf 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
@@ -16,49 +16,53 @@
* limitations under the License.
*/
-package org.apache.jena.riot.tokens;
+package org.apache.jena.riot.tokens ;
import java.io.ByteArrayInputStream ;
import java.io.InputStream ;
import java.io.Reader ;
+import java.io.StringReader ;
import org.apache.jena.atlas.io.PeekReader ;
import org.apache.jena.atlas.lib.StrUtils ;
-public class TokenizerFactory
-{
- /** Discouraged - be careful about character sets */
- public static Tokenizer makeTokenizer(Reader reader)
- {
+public class TokenizerFactory {
+
+ /** Discouraged - be careful about character sets */
+ @Deprecated
+ public static Tokenizer makeTokenizer(Reader reader) {
PeekReader peekReader = PeekReader.make(reader) ;
Tokenizer tokenizer = new TokenizerText(peekReader) ;
return tokenizer ;
}
-
- public static Tokenizer makeTokenizerUTF8(InputStream in)
- {
+
+ /** Discouraged - be careful about character sets */
+ public static Tokenizer makeTokenizer(StringReader reader) {
+ PeekReader peekReader = PeekReader.make(reader) ;
+ Tokenizer tokenizer = new TokenizerText(peekReader) ;
+ return tokenizer ;
+ }
+
+ public static Tokenizer makeTokenizerUTF8(InputStream in) {
// BOM will be removed
PeekReader peekReader = PeekReader.makeUTF8(in) ;
Tokenizer tokenizer = new TokenizerText(peekReader) ;
return tokenizer ;
}
-
- public static Tokenizer makeTokenizerASCII(InputStream in)
- {
+
+ public static Tokenizer makeTokenizerASCII(InputStream in) {
PeekReader peekReader = PeekReader.makeASCII(in) ;
Tokenizer tokenizer = new TokenizerText(peekReader) ;
return tokenizer ;
}
-
- public static Tokenizer makeTokenizerASCII(String string)
- {
+
+ public static Tokenizer makeTokenizerASCII(String string) {
byte b[] = StrUtils.asUTF8bytes(string) ;
ByteArrayInputStream in = new ByteArrayInputStream(b) ;
return makeTokenizerASCII(in) ;
}
-
- public static Tokenizer makeTokenizerString(String str)
- {
+
+ public static Tokenizer makeTokenizerString(String str) {
PeekReader peekReader = PeekReader.readString(str) ;
Tokenizer tokenizer = new TokenizerText(peekReader) ;
return tokenizer ;
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
----------------------------------------------------------------------
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index 5b03485..ba420bb 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -37,7 +37,7 @@ public final class TokenizerText implements Tokenizer
// TODO Remove CNTL and make SYMBOLS
// Drop through to final general symbol/keyword reader, including <=, !=
// Care with <=
- // STRING, not STIRNG1/2, LONG_STRING1,2
+ // STRING, not STRING1/2, LONG_STRING1/2
// Policy driven for CURIES?
// Various allow/deny options (via checker?)
@@ -432,36 +432,36 @@ public final class TokenizerText implements Tokenizer
stringBuilder.setLength(0) ;
for (;;) {
int ch = reader.readChar() ;
- if ( ch == EOF )
- exception("Broken IRI (End of file): %s",
stringBuilder.toString()) ;
- if ( ch == '\n' )
- exception("Broken IRI (newline): %s",
stringBuilder.toString()) ;
- if ( ch == '\r' )
- exception("Broken IRI (CR): %s", stringBuilder.toString()) ;
- if ( ch == CH_GT )
- return stringBuilder.toString() ;
- if ( ch == '\\' ) {
- if ( VeryVeryLax )
- ch = readCharEscapeAnyURI() ;
- else
- // NORMAL
- ch = readUnicodeEscape() ;
- // Drop through.
+ switch(ch) {
+ case EOF:
+ exception("Broken IRI (End of file): %s",
stringBuilder.toString()) ;
+ case NL:
+ exception("Broken IRI (newline): %s",
stringBuilder.toString()) ;
+ case CR:
+ exception("Broken IRI (CR): %s", stringBuilder.toString())
;
+ case CH_GT:
+ // Done!
+ return stringBuilder.toString() ;
+ case CH_RSLASH:
+ if ( VeryVeryLax )
+ ch = readCharEscapeAnyURI() ;
+ else
+ // NORMAL
+ ch = readUnicodeEscape() ;
+ break ;
}
if ( !VeryVeryLax ) {
- // JENA-911
-// if ( ch == 0x09 )
-// exception("Broken IRI (Tab character): %s",
stringBuilder.toString()) ;
-// if ( ch <= 0x19 )
-// exception("Broken IRI (control char 0x%02X): %s", ch,
stringBuilder.toString()) ;
-// if ( ch == 0x20 )
-// exception("Broken IRI (space): %s...",
stringBuilder.toString()) ;
-// if ( ch == '"' || ch == '{' || ch == '}' || ch == '|' || ch
== '^' || ch == '`')
-// exception("Broken IRI (Illegal character 0x%02X, '%c'):
%s", ch, (char)ch, stringBuilder.toString()) ;
- // Ban certain very bad characters
- if ( ch == '<' )
- exception("Broken IRI (bad character: '%c'): %s", ch,
stringBuilder.toString()) ;
+ if ( ch == CH_LT ) // '<' -- very bad
+ exception("Broken IRI (bad character: '%c'): %s",
(char)ch, stringBuilder.toString()) ;
+ if ( ch == TAB )
+ exception("Broken IRI (Tab character): %s",
stringBuilder.toString()) ;
+ if ( ch <= 0x19 )
+ exception("Broken IRI (control char 0x%02X): %s", ch,
stringBuilder.toString()) ;
+ if ( ch == SPC )
+ exception("Broken IRI (space): %s...",
stringBuilder.toString()) ;
+ if ( ch == '"' || ch == '{' || ch == '}' || ch == '|' || ch ==
'^' || ch == '`')
+ exception("Broken IRI (Illegal character 0x%02X, '%c'):
%s", ch, (char)ch, stringBuilder.toString()) ;
}
insertCodepoint(stringBuilder, ch) ;
}
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
----------------------------------------------------------------------
diff --git
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
index d1de6e3..a401195 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangNTuples.java
@@ -123,22 +123,22 @@ abstract public class TestLangNTuples extends BaseTest
}
// Bad terms - rejected as a fatal error (space in IRI).
- @Test
+ @Test(expected=ExFatal.class)
public void tuple_bad_10() { parseCount("<x> <p> <bad uri> .") ; }
- // Bad terms - but accepted by default.
+ // Bad terms (value range) - but legal syntax
@Test
public void tuple_bad_11() { parseCount("<x> <p>
\"9000\"^^<http://www.w3.org/2001/XMLSchema#byte> .") ; }
- // Bad terms - but accepted by default.
- @Test (expected=ExError.class)
+ // Bad - relative URI.
+ @Test(expected=ExError.class)
public void tuple_bad_21() { parseCheck("<x> <p> <z> .") ; }
- // Bad terms - with checking.
- @Test (expected=ExWarning.class)
- public void tuple_bad_22() { parseCheck("<http://example/x>
<http://example/p> <http://example/bad uri> .") ; }
+ // Bad terms
+ @Test(expected=ExFatal.class)
+ public void tuple_bad_22() { parseCheck("<http://example/x>
<http://example/p> \"abc\"^^<http://example/bad uri> .") ; }
- @Test (expected=ExWarning.class)
+ @Test(expected=ExWarning.class)
public void tuple_bad_23() { parseCheck("<http://example/x>
<http://example/p> \"9000\"^^<http://www.w3.org/2001/XMLSchema#byte> .") ; }
// ASCII vs UTF-8
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
index 5b47f42..bb04e29 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
@@ -21,10 +21,10 @@ package org.apache.jena.riot.lang;
import org.apache.jena.atlas.junit.BaseTest ;
import org.apache.jena.atlas.lib.StrUtils ;
import org.apache.jena.graph.Triple ;
-import org.apache.jena.riot.ErrorHandlerTestLib ;
+import org.apache.jena.riot.* ;
import org.apache.jena.riot.ErrorHandlerTestLib.ErrorHandlerEx ;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ;
import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ;
-import org.apache.jena.riot.RiotReader ;
import org.apache.jena.riot.system.StreamRDF ;
import org.apache.jena.riot.system.StreamRDFLib ;
import org.apache.jena.riot.tokens.Tokenizer ;
@@ -72,13 +72,13 @@ public class TestLangTrig extends BaseTest
// Also need to check that the RiotException is called in normal use.
// Bad terms.
- @Test (expected=ExWarning.class)
+ @Test (expected=ExFatal.class)
public void trig_20() { parse("@prefix ex: <bad iri> .", "{ ex:s ex:p
123 }") ; }
- @Test (expected=ExWarning.class)
+ @Test (expected=ExFatal.class)
public void trig_21() { parse("@prefix ex: <http://example/> .", "{
ex:s <http://example/broken p> 123 }") ; }
- @Test (expected=ExWarning.class)
+ @Test (expected=ExFatal.class)
public void trig_22() { parse("{ <x> <p> 'number'^^<bad uri> }") ; }
@Test (expected=ExWarning.class)
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
----------------------------------------------------------------------
diff --git
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
index 2c59a58..be1bdc0 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
@@ -22,7 +22,6 @@ import static
org.apache.jena.riot.system.ErrorHandlerFactory.errorHandlerNoLogg
import static
org.apache.jena.riot.system.ErrorHandlerFactory.getDefaultErrorHandler ;
import static
org.apache.jena.riot.system.ErrorHandlerFactory.setDefaultErrorHandler ;
-import java.io.Reader ;
import java.io.StringReader ;
import org.apache.jena.atlas.junit.BaseTest ;
@@ -135,7 +134,7 @@ public class TestLangTurtle extends BaseTest
private static Graph parse(String ...strings)
{
String string = StrUtils.strjoin("\n", strings) ;
- Reader reader = new StringReader(string) ;
+ StringReader reader = new StringReader(string) ;
String baseIRI = "http://base/" ;
Tokenizer tokenizer = TokenizerFactory.makeTokenizer(reader) ;
@@ -182,7 +181,7 @@ public class TestLangTurtle extends BaseTest
@Test(expected=ExFatal.class)
public void errorBadDatatype() { parse("<p> <p> 'q'^^.") ; }
- @Test(expected=ExWarning.class)
+ @Test(expected=RiotException.class)
public void errorBadURI_1()
{ parse("<http://example/a b> <http://example/p> 123 .") ; }
@@ -195,25 +194,29 @@ public class TestLangTurtle extends BaseTest
public void errorBadURI_3()
{ parse("<http://example/a%Aab> <http://example/p> 123 .") ; }
+ // Bad URIs
+ @Test (expected=ExFatal.class)
+ public void errorBadURI_4() { parse("@prefix ex: <bad iri> . ex:s
ex:p 123 ") ; }
+
+ @Test (expected=ExFatal.class)
+ public void errorBadURI_5() { parse("<x> <p> 'number'^^<bad uri> ") ; }
+
@Test
- public void turtle_01()
- {
+ public void turtle_01() {
Triple t = parseOneTriple("<s> <p> 123 . ") ;
Triple t2 = SSE.parseTriple("(<http://base/s> <http://base/p> 123)") ;
assertEquals(t2, t) ;
}
@Test
- public void turtle_02()
- {
+ public void turtle_02() {
Triple t = parseOneTriple("@base <http://example/> . <s> <p> 123 . ") ;
Triple t2 = SSE.parseTriple("(<http://example/s> <http://example/p>
123)") ;
assertEquals(t2, t) ;
}
@Test
- public void turtle_03()
- {
+ public void turtle_03() {
Triple t = parseOneTriple("@prefix ex: <http://example/x/> . ex:s ex:p
123 . ") ;
Triple t2 = SSE.parseTriple("(<http://example/x/s>
<http://example/x/p> 123)") ;
assertEquals(t2, t) ;
@@ -223,16 +226,6 @@ public class TestLangTurtle extends BaseTest
@Test (expected=ExFatal.class)
public void turtle_10() { parse("@prefix ex: <http://example/> . {
ex:s ex:p 123 . } ") ; }
- // Bad terms.
- @Test (expected=ExWarning.class)
- public void turtle_20() { parse("@prefix ex: <bad iri> . ex:s ex:p
123 ") ; }
-
- @Test (expected=ExWarning.class)
- public void turtle_21() { parse("@prefix ex: <http://example/> . ex:s
<http://example/broken p> 123") ; }
-
- @Test (expected=ExWarning.class)
- public void turtle_22() { parse("<x> <p> 'number'^^<bad uri> ") ; }
-
@Test (expected=ExWarning.class)
- public void turtle_23() { parse("@prefix xsd:
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; }
+ public void turtle_20() { parse("@prefix xsd:
<http://www.w3.org/2001/XMLSchema#> . <x> <p> 'number'^^xsd:byte }") ; }
}
http://git-wip-us.apache.org/repos/asf/jena/blob/0cac294e/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
----------------------------------------------------------------------
diff --git
a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
index 89b9cb8..a85c861 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
@@ -23,6 +23,7 @@ import java.io.ByteArrayInputStream ;
import org.apache.jena.atlas.io.PeekReader ;
import org.apache.jena.atlas.junit.BaseTest ;
import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.riot.RiotException ;
import org.apache.jena.riot.RiotParseException ;
import org.apache.jena.sparql.ARQConstants ;
import org.junit.Test ;
@@ -152,6 +153,52 @@ public class TestTokenizer extends BaseTest {
tokenizeAndTestFirst("<abc\\u0041def> 123", TokenType.IRI,
"abcAdef") ;
}
+ // Bad IRIs
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri10() {
+ tokenFirst("<abc def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri11() {
+ tokenFirst("<abc<def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri12() {
+ tokenFirst("<abc{def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri13() {
+ tokenFirst("<abc}def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri14() {
+ tokenFirst("<abc|def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri15() {
+ tokenFirst("<abc^def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri16() {
+ tokenFirst("<abc`def>") ;
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri17() {
+ tokenFirst("<abc\tdef>") ; // Java escape - real tab
+ }
+
+ @Test(expected=RiotException.class)
+ public void tokenUnit_iri18() {
+ tokenFirst("<abc\u0007def>") ; // Java escape - codepoint 7
+ }
+
@Test
public void tokenUnit_str1() {
tokenizeAndTestExact(" 'abc' ", TokenType.STRING1, "abc") ;