Convert NQuadsParser to extend the sesame-rio-nquads parser, keeping the 
parse(InputStream, String) override that builds its InputStreamReader with UTF-8


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/34cee606
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/34cee606
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/34cee606

Branch: refs/heads/ANY23-141
Commit: 34cee606fee6f2bcc0b319976af04c6369ca956b
Parents: 384944f
Author: Peter Ansell <[email protected]>
Authored: Wed Oct 24 14:16:34 2012 +1000
Committer: Peter Ansell <[email protected]>
Committed: Fri Apr 19 13:35:45 2013 +1000

----------------------------------------------------------------------
 .../org/apache/any23/io/nquads/NQuadsParser.java   |  694 +--------------
 1 files changed, 1 insertions(+), 693 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/34cee606/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParser.java
----------------------------------------------------------------------
diff --git a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParser.java 
b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParser.java
index 7ca4fe9..3f5b5d2 100644
--- a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParser.java
+++ b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParser.java
@@ -17,27 +17,12 @@
 
 package org.apache.any23.io.nquads;
 
-import org.openrdf.model.BNode;
-import org.openrdf.model.Resource;
-import org.openrdf.model.Statement;
-import org.openrdf.model.URI;
-import org.openrdf.model.Value;
-import org.openrdf.model.datatypes.XMLDatatypeUtil;
-import org.openrdf.model.impl.URIImpl;
-import org.openrdf.rio.ParseLocationListener;
-import org.openrdf.rio.RDFFormat;
-import org.openrdf.rio.RDFHandler;
 import org.openrdf.rio.RDFHandlerException;
 import org.openrdf.rio.RDFParseException;
-import org.openrdf.rio.helpers.NTriplesParserSettings;
-import org.openrdf.rio.helpers.RDFParserBase;
-import org.openrdf.rio.ntriples.NTriplesUtil;
 
-import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.Reader;
 import java.nio.charset.Charset;
 
 /**
@@ -48,61 +33,7 @@ import java.nio.charset.Charset;
  * @author Michele Mostarda ([email protected])
  * @see org.openrdf.rio.RDFParser
  */
-public class NQuadsParser extends RDFParserBase {
-
-    /**
-     * Location listener acquired when parsing started.
-     */
-    private ParseLocationListener locationListener;
-
-    /**
-     * RDF handler acquired when parsing started.
-     */
-    private RDFHandler rdfHandler;
-
-    /**
-     * Current row, col and marker trackers.
-     */
-    private int row, col, mark;
-
-    public NQuadsParser() {}
-
-    public RDFFormat getRDFFormat() {
-        return RDFFormat.NQUADS;
-    }
-
-    public void parse(Reader reader, String baseURI)
-    throws IOException, RDFParseException, RDFHandlerException {
-        if(reader == null) {
-            throw new NullPointerException("reader cannot be null.");
-        }
-        if(baseURI == null) {
-            throw new NullPointerException("baseURI cannot be null.");
-        }
-        
-        try {
-            row = col = 1;
-
-            locationListener = getParseLocationListener();
-            rdfHandler = getRDFHandler();
-
-            setBaseURI(baseURI);
-
-            final BufferedReader br = new BufferedReader( reader );
-            if( rdfHandler != null ) {
-                rdfHandler.startRDF();
-            }
-            while( parseLine(br) ) {
-                nextRow();
-            }
-        } finally {
-            if(rdfHandler != null) {
-                rdfHandler.endRDF();
-            }
-            clear();
-            clearBNodeIDMap();
-        }
-    }
+public class NQuadsParser extends org.openrdf.rio.nquads.NQuadsParser {
 
     public synchronized void parse(InputStream is, String baseURI)
     throws IOException, RDFParseException, RDFHandlerException {
@@ -117,627 +48,4 @@ public class NQuadsParser extends RDFParserBase {
         this.parse(new InputStreamReader(is, Charset.forName("UTF-8")), 
baseURI);
     }
 
-    /**
-     * Moves to the next row, resets the column.
-     */
-    private void nextRow() {
-        col = 1;
-        row++;
-        if(locationListener != null) {
-            locationListener.parseLocationUpdate(row, col);
-        }
-    }
-
-    /**
-     * Moves to the next column.
-     */
-    private void nextCol() {
-        col++;
-        if(locationListener != null) {
-            locationListener.parseLocationUpdate(row, col);
-        }
-    }
-
-    /**
-     * Reads the next char.
-     *
-     * @param br
-     * @return the next read char.
-     * @throws IOException
-     */
-    private char readChar(BufferedReader br) throws IOException {
-        final int c = br.read();
-        if(c == -1) {
-            throw new EOS();
-        }
-        nextCol();
-        return (char) c;
-    }
-
-    /**
-     * Reads an unicode char with pattern <code>\\uABCD</code>.
-     *
-     * @param br input reader.
-     * @return read char.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private char readUnicode(BufferedReader br) throws IOException, 
RDFParseException {
-        final char[] unicodeSequence = new char[4];
-        for(int i = 0; i < unicodeSequence.length; i++) {
-            unicodeSequence[i] = readChar(br);
-        }
-        final String unicodeCharStr = new String(unicodeSequence);
-        try {
-            return (char) Integer.parseInt(unicodeCharStr, 16);
-        } catch (NumberFormatException nfe) {
-            reportError("Error while converting unicode char '\\u" + 
unicodeCharStr + "'", row, col, 
NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
-            throw new IllegalStateException();
-        }
-    }
-
-    /**
-     * Marks the buffered input stream with the current location.
-     *
-     * @param br
-     */
-    private void mark(BufferedReader br) throws IOException {
-        mark = col;
-        br.mark(5);
-    }
-
-    /**
-     * Resets the buffered input stream and update the new location.
-     *
-     * @param br
-     * @throws IOException
-     */
-    private void reset(BufferedReader br) throws IOException {
-        col = mark;
-        br.reset();
-        if(locationListener != null) {
-            locationListener.parseLocationUpdate(row, col);
-        }
-    }
-
-    /**
-     * Asserts to read a specific char.
-     *
-     * @param br
-     * @param c
-     * @throws IOException
-     */
-    private void assertChar(BufferedReader br, char c) throws IOException, 
RDFParseException {
-        final char read = readChar(br);
-        if(read != c) {
-            throw new RDFParseException(
-                    String.format("Unexpected char '%s', expected '%s'", read, 
c),
-                    row, col
-            );
-        }
-    }
-
-    /**
-     * Consumes the reader until the next carriage return.
-     *
-     * @param br
-     * @throws IOException
-     */
-    private void consumeBrokenLine(BufferedReader br) throws IOException {
-        char c;
-        while (true) {
-            mark(br);
-            c = readChar(br);
-            if (c == '\n') {
-                return;
-            }
-        }
-    }
-
-    /**
-     * Parsers an <i>NQuads</i> line.
-     *
-     * @param br input stream reader containing NQuads.
-     * @return <code>false</code> if the parsing completed, <code>true</code> 
otherwise.
-     * @throws IOException
-     * @throws RDFParseException
-     * @throws RDFHandlerException
-     */
-    private boolean parseLine(BufferedReader br)
-    throws IOException, RDFParseException, RDFHandlerException {
-
-        if(!consumeSpacesAndNotEOS(br)) {
-            return false;
-        }
-
-        // Consumes empty line or line comment.
-        try {
-            if(consumeEmptyLine(br)) return true;
-            if( consumeComment(br) ) return true;
-        } catch (EOS eos) {
-            return false;
-        }
-
-        final Resource sub;
-        final URI      pred;
-        final Value    obj;
-        final URI      context;
-        try {
-            sub = parseSubject(br);
-            consumeSpaces(br);
-            pred = parsePredicate(br);
-            consumeSpaces(br);
-            obj = parseObject(br);
-            consumeSpaces(br);
-            context = parseContextAndOrDot(br);
-        } catch (EOS eos) {
-            reportFatalError("Unexpected end of stream.", row, col);
-            throw new IllegalStateException();
-        } catch (Exception e) { // FIXME: We should not be catching 
IOException here as that is used to indicate a failure of the stream/writer
-            if(super.stopAtFirstError()) {
-                if(e instanceof RDFParseException)
-                    throw (RDFParseException) e;
-                else
-                    throw new RDFParseException(e, row, col);
-            } else { // Remove rest of broken line and report error.
-                consumeBrokenLine(br);
-                reportError(e.getMessage(), row, col, 
NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
-                return true;
-            }
-        }
-
-        assert sub  != null : "Subject cannot be null.";
-        assert pred != null : "Predicate cannot be null.";
-        assert obj  != null : "Object cannot be null.";
-        notifyStatement(sub, pred, obj, context);
-
-        if(!consumeSpacesAndNotEOS(br)) {
-            return false;
-        }
-        return readChar(br) == '\n';
-    }
-
-    /**
-     * Consumes the line if empty (contains just a carriage return).
-     *
-     * @param br input NQuads stream.
-     * @return <code>true</code> if the line is empty.
-     * @throws IOException if an error occurs while consuming stream.
-     */
-    private boolean consumeEmptyLine(BufferedReader br) throws IOException {
-        char c;
-        mark(br);
-        c = readChar(br);
-        if (c == '\n') {
-            return true;
-        } else {
-            reset(br);
-            return false;
-        }
-    }
-
-    /**
-     * Consumes all subsequent spaces and returns true, if End Of Stream is 
reached instead returns false.
-     * @param br input NQuads stream reader.
-     * @return <code>true</code> if there are other chars to be consumed.
-     * @throws IOException if an error occurs while consuming stream.
-     */
-    private boolean consumeSpacesAndNotEOS(BufferedReader br) throws 
IOException {
-        try {
-            consumeSpaces(br);
-            return true;
-        } catch (EOS eos) {
-            return false;
-        }
-    }
-
-    /**
-     * Consumes a comment if any.
-     *
-     * @param br input NQuads stream reader.
-     * @return <code>true</code> if comment has been consumed, false otherwise.
-     * @throws IOException
-     */
-    private boolean consumeComment(BufferedReader br) throws IOException {
-        char c;
-        mark(br);
-        c = readChar(br);
-        if (c == '#') {
-            mark(br);
-            while (readChar(br) != '\n');
-            mark(br);
-            return true;
-        } else {
-            reset(br);
-            return false;
-        }
-    }
-
-    /**
-     * Notifies the parsed statement to the {@link RDFHandler}.
-     *
-     * @param sub
-     * @param pred
-     * @param obj
-     * @param context
-     * @throws RDFParseException
-     * @throws RDFHandlerException
-     */
-    private void notifyStatement(Resource sub, URI pred, Value obj, URI 
context)
-    throws RDFParseException, RDFHandlerException {
-        Statement statement = super.createStatement(sub, pred, obj, context);
-        if (rdfHandler != null) {
-            try {
-                rdfHandler.handleStatement(statement);
-            } catch (RDFHandlerException rdfhe) {
-                reportFatalError(rdfhe);
-                throw rdfhe;
-            }
-        }
-    }
-
-    /**
-     * Consumes spaces until a non space char is detected.
-     *
-     * @param br input stream reader from which consume spaces.
-     * @throws IOException
-     */
-    private void consumeSpaces(BufferedReader br) throws IOException {
-        char c;
-        while(true) {
-            mark(br);
-            c = readChar(br);
-            if(c == ' ' || c == '\r' || c == '\f' || c == '\t') {
-                mark(br);
-            } else {
-                break;
-            }
-        }
-        reset(br);
-    }
-
-    /**
-     * Consumes the dot at the end of NQuads line.
-     *
-     * @param br
-     * @throws IOException
-     */
-    private void parseDot(BufferedReader br) throws IOException, 
RDFParseException {
-        assertChar(br, '.');
-    }
-
-    /**
-     * Parses a URI enclosed within &lt; and &gt; brackets.
-     * @param br
-     * @return the parsed URI.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private URI parseURI(BufferedReader br) throws IOException, 
RDFParseException {
-        assertChar(br, '<');
-
-        StringBuilder sb = new StringBuilder();
-        char c;
-        while(true) {
-            c = readChar(br);
-            if(c != '>') {
-                sb.append(c);
-            } else {
-                break;
-            }
-        }
-        mark(br);
-
-        try {
-            // TODO - LOW: used to unescape \\uXXXX unicode chars. Unify with 
#printEscaped().
-            String uriStr = NTriplesUtil.unescapeString( sb.toString() );
-            URI uri;
-            if(uriStr.charAt(0) == '#') {
-                uri = super.resolveURI(uriStr);
-            } else {
-                uri = super.createURI(uriStr);
-            }
-            return uri;
-        } catch (RDFParseException rdfpe) {
-            reportFatalError(rdfpe, row, col);
-            throw rdfpe;
-        }
-    }
-
-    /**
-     * Parses a BNode.
-     *
-     * @param br the buffered input stream.
-     * @return the generated bnode.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private BNode parseBNode(BufferedReader br) throws IOException, 
RDFParseException {
-        assertChar(br, '_');
-        assertChar(br, ':');
-
-        char c;
-        StringBuilder sb = new StringBuilder();
-        while(true) {
-            c = readChar(br);
-            if(c != ' ' && c != '<') {
-                sb.append(c);
-                mark(br);
-            } else {
-                break;
-            }
-        }
-        reset(br);
-
-        try {
-            return createBNode( sb.toString() );
-        } catch (RDFParseException rdfpe) {
-            reportFatalError(rdfpe, row, col);
-            throw rdfpe;
-        }
-    }
-
-    /**
-     * Parses a literal attribute that can be either the language or the data 
type.
-     *
-     * @param br
-     * @return the literal attribute.
-     * @throws IOException
-     */
-    private LiteralAttribute parseLiteralAttribute(BufferedReader br) throws 
IOException, RDFParseException {
-        char c = readChar(br);
-        if(c != '^' && c != '@') {
-            reset(br);
-            return null;
-        }
-
-        boolean isLang = true;
-        if(c == '^') {
-            isLang = false;
-            assertChar(br, '^');
-        }
-
-        final String attribute;
-        if (isLang) { // Read until space or context begin.
-            final StringBuilder sb = new StringBuilder();
-            while (true) {
-                c = readChar(br);
-                if (c != ' ' && c != '<') {
-                    mark(br);
-                    sb.append(c);
-                } else {
-                    reset(br);
-                    break;
-                }
-            }
-            attribute = sb.toString();
-        }  else {
-            attribute = parseURI(br).stringValue();
-        }
-
-        return new LiteralAttribute(isLang, attribute);
-    }
-
-    /**
-     * Validates and normalize the value of a literal on the basis of the 
datat ype handling policy and
-     * the associated data type.
-     *
-     * @param value
-     * @param datatype
-     * @return the normalized data type. It depends on the data type handling 
policy and the specified data type.
-     * @throws RDFParseException
-     */
-    private String validateAndNormalizeLiteral(String value, URI datatype) 
throws RDFParseException {
-        DatatypeHandling dh = datatypeHandling();
-        if(dh.equals( DatatypeHandling.IGNORE )) {
-            return value;
-        }
-
-        if ( dh.equals(DatatypeHandling.VERIFY) ) {
-            if( ! XMLDatatypeUtil.isBuiltInDatatype(datatype)){
-                return value;
-            }
-            if( ! XMLDatatypeUtil.isValidValue(value, datatype) ) {
-                throw new RDFParseException(
-                        String.format("Illegal literal value '%s' with 
datatype %s", value, datatype.stringValue() ),
-                        row, col
-                );
-            }
-            return value;
-        } else if( dh.equals(DatatypeHandling.NORMALIZE) ) {
-            return XMLDatatypeUtil.normalize(value, datatype);
-        } else {
-            throw new IllegalArgumentException( String.format("Unsupported 
datatype handling: %s", dh) );
-        }
-    }
-
-    /**
-     * Prints the escaped version of the given char c.
-     *
-     * @param c escaped char.
-     * @param sb output string builder.
-     */
-    private void printEscaped(char c, StringBuilder sb) {
-        if(c == 'b') {
-            sb.append('\b');
-            return;
-        }
-        if(c == 'f') {
-            sb.append('\f');
-            return;
-        }
-        if(c == 'n') {
-            sb.append('\n');
-            return;
-        }
-        if(c == 'r') {
-            sb.append('\r');
-            return;
-        }
-        if(c == 't') {
-            sb.append('\t');
-            return;
-        }
-    }
-
-    /**
-     * Parses a literal.
-     *
-     * @param br
-     * @return the parsed literal.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private Value parseLiteral(BufferedReader br) throws IOException, 
RDFParseException {
-        assertChar(br, '"');
-
-        char c;
-        boolean escaped = false;
-        StringBuilder sb = new StringBuilder();
-        while(true) {
-            c = readChar(br);
-            if( c == '\\' ) {
-                if(escaped) {
-                    escaped = false;
-                    sb.append(c);
-                } else {
-                    escaped = true;
-                }
-                continue;
-            } else if(c == '"' && !escaped) {
-                break;
-            }
-            if(escaped) {
-                if(c == 'u') {
-                    char unicodeChar = readUnicode(br);
-                    sb.append(unicodeChar);
-                } else {
-                    printEscaped(c, sb);
-                }
-                escaped = false;
-            } else {
-                sb.append(c);
-            }
-        }
-        mark(br);
-
-        LiteralAttribute lt = parseLiteralAttribute(br);
-
-        final String value = sb.toString();
-        if(lt == null) {
-            return createLiteral(value, null, null);
-        }else if(lt.isLang) {
-            return createLiteral(
-                    value,
-                    lt.value,
-                    null
-            );
-        } else {
-            URI literalType = null;
-            try {
-                literalType = new URIImpl(lt.value);
-            } catch (Exception e) {
-                reportError( String.format("Error while parsing literal type 
'%s'", lt.value), row, col , 
NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
-            }
-            return createLiteral(
-                    validateAndNormalizeLiteral(value, literalType),
-                    null,
-                    literalType
-            );
-        }
-    }
-
-    /**
-     * Parses the subject sequence.
-     *
-     * @param br
-     * @return the corresponding URI object.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private Resource parseSubject(BufferedReader br) throws IOException, 
RDFParseException {
-        mark(br);
-        char c = readChar(br);
-        reset(br);
-        if( c == '<' ) {
-            return parseURI(br);
-        } else {
-            return parseBNode(br);
-        }
-    }
-
-    /**
-     * Parses the predicate URI.
-     *
-     * @param br
-     * @return the corresponding URI object.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private URI parsePredicate(BufferedReader br) throws IOException, 
RDFParseException {
-        return parseURI(br);
-    }
-
-    /**
-     * Parses the the object sequence.
-     *
-     * @param br
-     * @return the corresponding URI object.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private Value parseObject(BufferedReader br) throws IOException, 
RDFParseException {
-        mark(br);
-        char c = readChar(br);
-        reset(br);
-        if( c == '<' ) {
-            return parseURI(br);
-        } else if( c == '_') {
-            return parseBNode(br);
-        } else {
-            return parseLiteral(br);
-        }
-    }
-
-    /**
-     * Represents a literal with its attribute value that can be either a 
language or a data type.
-     */
-    class LiteralAttribute {
-        final boolean isLang;
-        final String value;
-
-        LiteralAttribute(boolean lang, String value) {
-            isLang = lang;
-            this.value = value;
-        }
-    }
-
-    /**
-     * Parses the context if any.
-     *
-     * @param br
-     * @return the context URI or null if not found.
-     * @throws IOException
-     * @throws RDFParseException
-     */
-    private URI parseContextAndOrDot(BufferedReader br) throws IOException, 
RDFParseException {
-        mark(br);
-        final char c = readChar(br);
-        reset(br);
-        if(c == '<') {
-            final URI context = parseURI(br);
-            consumeSpaces(br);
-            parseDot(br);
-            return context;
-        } else {
-            parseDot(br);
-            return null;
-        }
-    }
-
-    /**
-     * Defines the End Of Stream exception.
-     */
-    class EOS extends IOException {}
-
 }

Reply via email to