Repository: jena
Updated Branches:
  refs/heads/master fd44d834a -> fb101b828


Clean tokenizer; support for warnings (and continue)


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/4d130f50
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/4d130f50
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/4d130f50

Branch: refs/heads/master
Commit: 4d130f509b18eea6136af2c8820977e0877e9733
Parents: fd44d83
Author: Andy Seaborne <[email protected]>
Authored: Wed Oct 26 20:42:18 2016 +0100
Committer: Andy Seaborne <[email protected]>
Committed: Thu Oct 27 09:58:41 2016 +0100

----------------------------------------------------------------------
 .../org/apache/jena/riot/lang/LangNTriples.java |   4 -
 .../jena/riot/system/ParserProfileChecker.java  |   6 +
 .../apache/jena/riot/tokens/TokenizerText.java  | 146 ++++++++++++-------
 3 files changed, 99 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/4d130f50/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
index 374bf07..535c3f8 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
@@ -84,10 +84,6 @@ public final class LangNTriples extends LangNTuple<Triple>
         
         if ( x.getType() != TokenType.DOT )
             exception(x, "Triple not terminated by DOT: %s", x) ;
-//        Node s = X ;
-//        Node p = X ;
-//        Node o = X ;
-//        return T ;
         
         Node s = tokenAsNode(sToken) ;
         Node p = tokenAsNode(pToken) ;

http://git-wip-us.apache.org/repos/asf/jena/blob/4d130f50/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java
index aa33649..a748aa8 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java
@@ -61,6 +61,12 @@ public class ParserProfileChecker extends ParserProfileBase 
// implements Parser
     public IRI makeIRI(String uriStr, long line, long col) {
         // resolves, but we handle the errors and warnings.
         IRI iri = prologue.getResolver().resolveSilent(uriStr) ;
+        if ( uriStr.contains(" ") ) {
+            // Specific check for spaces.
+            errorHandler.warning("Bad IRI: <"+uriStr+"> Spaces are not legal 
in URIs/IRIs.", line, col);
+            return iri ; 
+        } 
+        // At this point, IRI "errors" are warnings. 
         CheckerIRI.iriViolations(iri, errorHandler, line, col) ;
         return iri ;
     }

http://git-wip-us.apache.org/repos/asf/jena/blob/4d130f50/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
----------------------------------------------------------------------
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java 
b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index 6537d06..eb6e707 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -18,8 +18,8 @@
 
 package org.apache.jena.riot.tokens;
 
-import static org.apache.jena.atlas.lib.Chars.* ;
-import static org.apache.jena.riot.system.RiotChars.* ;
+import static org.apache.jena.atlas.lib.Chars.*;
+import static org.apache.jena.riot.system.RiotChars.*;
 
 import java.util.NoSuchElementException ;
 
@@ -28,6 +28,7 @@ import org.apache.jena.atlas.io.IO ;
 import org.apache.jena.atlas.io.PeekReader ;
 import org.apache.jena.atlas.lib.Chars ;
 import org.apache.jena.riot.RiotParseException ;
+import org.apache.jena.riot.system.ErrorHandler;
 import org.apache.jena.riot.system.RiotChars ;
 import org.apache.jena.sparql.ARQInternalErrorException ;
 
@@ -60,6 +61,26 @@ public final class TokenizerText implements Tokenizer
     private boolean finished = false ;
     private TokenChecker checker = null ;
 
+    // The code assumes that errors throw exception and so stop parsing.
+    private ErrorHandler errorHandler = new ErrorHandler() {
+        @Override
+        public void warning(String message, long line, long col) {
+            // Warning/continue.
+            //ErrorHandlerFactory.errorHandlerStd.warning(message, line, col);
+            throw new RiotParseException(message, line, col) ;
+        }
+
+        @Override
+        public void error(String message, long line, long col) {
+            throw new RiotParseException(message, line, col) ;
+        }
+
+        @Override
+        public void fatal(String message, long line, long col) {
+            throw new RiotParseException(message, line, col) ;
+        }
+    } ;
+
     /*package*/ TokenizerText(PeekReader reader) {
         this(reader, false) ;
     }
@@ -124,11 +145,25 @@ public final class TokenizerText implements Tokenizer
     }
     
     @Override
-    public void remove()                            { throw new 
UnsupportedOperationException() ; }
+    public void remove()                            
+    { throw new UnsupportedOperationException() ; }
 
-    public TokenChecker getChecker()                { return checker ; }
-    public void setChecker(TokenChecker checker)    { this.checker = checker ; 
}
+    public TokenChecker getChecker() {
+        return checker;
+    }
+
+    public void setChecker(TokenChecker checker) {
+        this.checker = checker;
+    }
+
+    public ErrorHandler getErrorHandler() {
+        return errorHandler;
+    }
 
+    public void setErrorHandler(ErrorHandler handler) {
+        this.errorHandler = handler;
+    }
+    
     @Override
     public void close() {
         IO.close(reader) ;
@@ -243,7 +278,7 @@ public final class TokenizerText implements Tokenizer
 
                 Token subToken = parseToken() ;
                 if ( !subToken.isIRI() )
-                    exception("Datatype URI required after ^^ - URI or 
prefixed name expected") ;
+                    error("Datatype URI required after ^^ - URI or prefixed 
name expected") ;
 
                 mainToken.setSubToken2(subToken) ;
                 mainToken.setType(TokenType.LITERAL_DT) ;
@@ -275,7 +310,7 @@ public final class TokenizerText implements Tokenizer
             token.setType(TokenType.CNTRL) ;
             ch = reader.readChar() ;
             if ( ch == EOF )
-                exception("EOF found after " + CTRL_CHAR) ;
+                error("EOF found after " + CTRL_CHAR) ;
             if ( RiotChars.isWhitespace(ch) )
                 token.cntrlCode = -1 ;
             else
@@ -428,6 +463,8 @@ public final class TokenizerText implements Tokenizer
 
     
     private static final boolean VeryVeryLaxIRI = false ;
+    // Spaces in IRI are illegal.
+    private static final boolean AllowSpacesInIRI = false ;
     
     // [8]  IRIREF  ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>'
     private String readIRI() {
@@ -436,11 +473,11 @@ public final class TokenizerText implements Tokenizer
             int ch = reader.readChar() ;
             switch(ch) {
                 case EOF:
-                    exception("Broken IRI (End of file)") ;
+                    error("Broken IRI (End of file)") ;
                 case NL:
-                    exception("Broken IRI (newline): %s", 
stringBuilder.toString()) ;
+                    error("Broken IRI (newline): %s", 
stringBuilder.toString()) ;
                 case CR:
-                    exception("Broken IRI (CR): %s", stringBuilder.toString()) 
;
+                    error("Broken IRI (CR): %s", stringBuilder.toString()) ;
                 case CH_GT:
                     // Done!
                     return stringBuilder.toString() ;
@@ -457,17 +494,20 @@ public final class TokenizerText implements Tokenizer
                     break ;
                 case CH_LT:
                     // Probably a corrupt file so not a warning.
-                    exception("Bad character in IRI (bad character: '<'): 
<%s<...>", stringBuilder.toString()) ;
+                    error("Bad character in IRI (bad character: '<'): 
<%s[<]...>", stringBuilder.toString()) ;
                 case TAB:
-                    exception("Bad character in IRI (Tab character): 
<%s[tab]...>", stringBuilder.toString()) ;
-                case SPC:
-                    warning("Bad character in IRI (space): <%s[space]...>", 
stringBuilder.toString()) ;
+                    error("Bad character in IRI (Tab character): 
<%s[tab]...>", stringBuilder.toString()) ;
                 case '{': case '}': case '"': case '|': case '^': case '`' :
                     if ( ! VeryVeryLaxIRI )
                         warning("Illegal character in IRI (codepoint 0x%02X, 
'%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch) ;
+                    break ;
+                case SPC:
+                    if ( ! AllowSpacesInIRI )
+                        warning("Bad character in IRI (space): 
<%s[space]...>", stringBuilder.toString()) ;
+                    break ;
                 default:
                     if ( ch <= 0x19 )
-                        warning("Illegal character in IRI (control char 
0x%02X): %s", ch, stringBuilder.toString()) ;
+                        warning("Illegal character in IRI (control char 
0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString()) ;
             }
             insertCodepoint(stringBuilder, ch) ;
         }
@@ -477,13 +517,13 @@ public final class TokenizerText implements Tokenizer
     private final int readUnicodeEscape() {
         int ch = reader.readChar() ;
         if ( ch == EOF )
-            exception("Broken escape sequence") ;
+            error("Broken escape sequence") ;
     
         switch (ch) {
             case 'u': return readUnicode4Escape(); 
             case 'U': return readUnicode8Escape(); 
             default:
-                exception("Illegal unicode escape sequence value: \\%c 
(0x%02X)", ch, ch);
+                error("Illegal unicode escape sequence value: \\%c (0x%02X)", 
ch, ch);
         }
         return 0 ;
     }
@@ -506,7 +546,7 @@ public final class TokenizerText implements Tokenizer
         // If we made no progress, nothing found, not even a keyword -- it's an
         // error.
         if ( posn == reader.getPosition() )
-            exception("Failed to find a prefix name or keyword: 
%c(%d;0x%04X)", ch, ch, ch) ;
+            error("Failed to find a prefix name or keyword: %c(%d;0x%04X)", 
ch, ch, ch) ;
 
         if ( Checking )
             checkKeyword(token.getImage()) ;
@@ -629,13 +669,13 @@ public final class TokenizerText implements Tokenizer
 
             ch = reader.peekChar() ;
             if ( ! isHexChar(ch) )
-                exception("Not a hex charcater: '%c'",ch) ;
+                error("Not a hex charcater: '%c'",ch) ;
             stringBuilder.append((char)ch) ;
             reader.readChar() ;
 
             ch = reader.peekChar() ;
             if ( ! isHexChar(ch) )
-                exception("Not a hex charcater: '%c'",ch) ;
+                error("Not a hex charcater: '%c'",ch) ;
             stringBuilder.append((char)ch) ;
             reader.readChar() ;
         }
@@ -661,11 +701,11 @@ public final class TokenizerText implements Tokenizer
             int ch = reader.readChar() ;
             if ( ch == EOF ) {
                 // if ( endNL ) return stringBuilder.toString() ;
-                exception("Broken token: " + stringBuilder.toString(), y, x) ;
+                error("Broken token: " + stringBuilder.toString(), y, x) ;
             }
 
             if ( ch == NL )
-                exception("Broken token (newline): " + 
stringBuilder.toString(), y, x) ;
+                error("Broken token (newline): " + stringBuilder.toString(), 
y, x) ;
 
             if ( ch == endCh ) {
                 return stringBuilder.toString() ;
@@ -684,7 +724,7 @@ public final class TokenizerText implements Tokenizer
             if ( ch == EOF ) {
                 if ( endNL )
                     return stringBuilder.toString() ;
-                exception("Broken long string") ;
+                error("Broken long string") ;
             }
 
             if ( ch == quoteChar ) {
@@ -771,14 +811,14 @@ public final class TokenizerText implements Tokenizer
         {
             int ch = reader.peekChar() ;
             if ( ch == EOF )
-                exception("Blank node label missing (EOF found)") ;
+                error("Blank node label missing (EOF found)") ;
             if ( isWhitespace(ch) )
-                exception("Blank node label missing") ;
+                error("Blank node label missing") ;
             // if ( ! isAlpha(ch) && ch != '_' )
             // Not strict
 
             if ( !RiotChars.isPNChars_U_N(ch) )
-                exception("Blank node label does not start with alphabetic or 
_ :" + (char)ch) ;
+                error("Blank node label does not start with alphabetic or _ :" 
+ (char)ch) ;
             reader.readChar() ;
             stringBuilder.append((char)ch) ;
         }
@@ -879,7 +919,7 @@ public final class TokenizerText implements Tokenizer
         if ( x == 0 && !isDecimal )
             // Possible a tokenizer error - should not have entered readNumber
             // in the first place.
-            exception("Unrecognized as number") ;
+            error("Unrecognized as number") ;
 
         if ( exponent(stringBuilder) ) {
             isDouble = true ;
@@ -905,7 +945,7 @@ public final class TokenizerText implements Tokenizer
             token.setType(TokenType.INTEGER) ;
     }
 
-    private static void readHex(PeekReader reader, StringBuilder sb) {
+    private void readHex(PeekReader reader, StringBuilder sb) {
         // Just after the 0x, which are in sb
         int x = 0 ;
         for (;;) {
@@ -918,7 +958,7 @@ public final class TokenizerText implements Tokenizer
             x++ ;
         }
         if ( x == 0 )
-            exception(reader, "No hex characters after " + sb.toString()) ;
+            error("No hex characters after " + sb.toString()) ;
     }
 
     private int readDigits(StringBuilder buffer) {
@@ -976,7 +1016,7 @@ public final class TokenizerText implements Tokenizer
         readPossibleSign(sb) ;
         int x = readDigits(sb) ;
         if ( x == 0 )
-            exception("Malformed double: " + sb) ;
+            error("Malformed double: " + sb) ;
         return true ;
     }
 
@@ -984,7 +1024,7 @@ public final class TokenizerText implements Tokenizer
         stringBuilder.setLength(0) ;
         a2z(stringBuilder) ;
         if ( stringBuilder.length() == 0 )
-            exception("Bad language tag") ;
+            error("Bad language tag") ;
         for (;;) {
             int ch = reader.peekChar() ;
             if ( ch == '-' ) {
@@ -993,7 +1033,7 @@ public final class TokenizerText implements Tokenizer
                 int x = stringBuilder.length() ;
                 a2zN(stringBuilder) ;
                 if ( stringBuilder.length() == x )
-                    exception("Bad language tag") ;
+                    error("Bad language tag") ;
             } else
                 break ;
         }
@@ -1030,7 +1070,7 @@ public final class TokenizerText implements Tokenizer
             // Convert to UTF-16. Note that the rest of any system this is used
             // in must also respect codepoints and surrogate pairs.
             if ( !Character.isDefined(ch) && 
!Character.isSupplementaryCodePoint(ch) )
-                exception("Illegal codepoint: 0x%04X", ch) ;
+                error("Illegal codepoint: 0x%04X", ch) ;
             char[] chars = Character.toChars(ch) ;
             buffer.append(chars) ;
         }
@@ -1108,7 +1148,7 @@ public final class TokenizerText implements Tokenizer
     private final int readLiteralEscape() {
         int c = reader.readChar() ;
         if ( c == EOF )
-            exception("Escape sequence not completed") ;
+            error("Escape sequence not completed") ;
 
         switch (c) {
             case 'n':   return NL ; 
@@ -1122,7 +1162,7 @@ public final class TokenizerText implements Tokenizer
             case 'u':   return readUnicode4Escape();
             case 'U':   return readUnicode8Escape();
             default:
-                exception("Illegal escape sequence value: %c (0x%02X)", c, c);
+                error("Illegal escape sequence value: %c (0x%02X)", c, c);
                 return 0 ;
         }
     }
@@ -1134,7 +1174,7 @@ public final class TokenizerText implements Tokenizer
 
         int c = reader.readChar() ;
         if ( c == EOF )
-            exception("Escape sequence not completed") ;
+            error("Escape sequence not completed") ;
 
         switch (c) {
             case '_': case '~': case '.':  case '-':  case '!':  case '$':  
case '&': 
@@ -1143,7 +1183,7 @@ public final class TokenizerText implements Tokenizer
             case '=':  case '/':  case '?':  case '#':  case '@':  case '%':
                 return c ;
             default:
-                exception("illegal character escape value: \\%c", c);
+                error("illegal character escape value: \\%c", c);
                 return 0 ;
         }
     }
@@ -1154,7 +1194,7 @@ public final class TokenizerText implements Tokenizer
     private final int readUnicode8Escape() {
         int ch8 = readHexSequence(8) ;
         if ( ch8 > Character.MAX_CODE_POINT )
-            exception("Illegal code point in \\U sequence value: 0x%08X", ch8) 
;
+            error("Illegal code point in \\U sequence value: 0x%08X", ch8) ;
         return ch8 ;
     }
 
@@ -1172,12 +1212,12 @@ public final class TokenizerText implements Tokenizer
     private final int readHexChar() {
         int ch = reader.readChar() ;
         if ( ch == EOF )
-            exception("Not a hexadecimal character (end of file)") ;
+            error("Not a hexadecimal character (end of file)") ;
 
         int x = valHexChar(ch) ;
         if ( x != -1 )
             return x ;
-        exception("Not a hexadecimal character: " + (char)ch) ;
+        error("Not a hexadecimal character: " + (char)ch) ;
         return -1 ;
     }
 
@@ -1185,13 +1225,13 @@ public final class TokenizerText implements Tokenizer
         for (int i = 0; i < str.length(); i++) {
             char want = str.charAt(i) ;
             if ( reader.eof() ) {
-                exception("End of input during expected string: " + str) ;
+                error("End of input during expected string: " + str) ;
                 return false ;
             }
             int inChar = reader.peekChar() ;
             if ( inChar != want ) {
                 // System.err.println("N-triple reader error");
-                exception("expected \"" + str + "\"") ;
+                error("expected \"" + str + "\"") ;
                 return false ;
             }
             reader.readChar() ;
@@ -1200,18 +1240,18 @@ public final class TokenizerText implements Tokenizer
     }
 
     private void warning(String message, Object... args) {
-        exception(message, args); 
+        String msg = String.format(message, args) ;
+        errorHandler.warning(msg, reader.getLineNum(), reader.getColNum()) ;
+        //exception(message, args); 
     }
     
-    private void exception(String message, Object... args) {
-        exception$(message, reader.getLineNum(), reader.getColNum(), args) ;
-    }
-
-    private static void exception(PeekReader reader, String message, Object... 
args) {
-        exception$(message, reader.getLineNum(), reader.getColNum(), args) ;
-    }
-
-    private static void exception$(String message, long line, long col, 
Object... args) {
-        throw new RiotParseException(String.format(message, args), line, col) ;
+    private void error(String message, Object... args) {
+        String msg = String.format(message, args) ;
+        long line = reader.getLineNum() ;
+        long col = reader.getColNum() ;
+        errorHandler.error(msg, line, col) ;
+        // We require that errors cause the tokenizer to stop so in case the
+        // provided error handler does not, we throw an exception.
+        throw new RiotParseException(message, line, col) ;
     }
 }

Reply via email to