This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 775204d811b7c51dc847f4f8b4003c4ff39a90f3 Author: Andy Seaborne <[email protected]> AuthorDate: Wed Dec 10 17:20:32 2025 +0000 GH-3634: Partial separation of TurtleJCC from RIOT --- jena-arq/Grammar/Turtle/turtle | 2 +- jena-arq/Grammar/Turtle/turtle.jj | 7 +- .../lang/turtlejcc/LangTurtleJCCParserBase.java | 272 +++++++++++++++++++++ .../apache/jena/riot/lang/turtlejcc/OutputRDF.java | 49 ++++ .../lang/turtlejcc/TurtleJavaccReaderRIOT.java | 54 +++- .../riot/lang/turtlejcc/javacc/ParseException.java | 2 +- .../lang/turtlejcc/javacc/SimpleCharStream.java | 2 +- .../jena/riot/lang/turtlejcc/javacc/Token.java | 2 +- .../riot/lang/turtlejcc/javacc/TokenMgrError.java | 2 +- .../riot/lang/turtlejcc/javacc/TurtleJavacc.java | 5 +- .../turtlejcc/javacc/TurtleJavaccTokenManager.java | 3 +- 11 files changed, 383 insertions(+), 17 deletions(-) diff --git a/jena-arq/Grammar/Turtle/turtle b/jena-arq/Grammar/Turtle/turtle index 7a38626fc0..78bae9060e 100755 --- a/jena-arq/Grammar/Turtle/turtle +++ b/jena-arq/Grammar/Turtle/turtle @@ -15,7 +15,7 @@ ## See the License for the specific language governing permissions and ## limitations under the License. -DIR="../../src/main/java/org/apache/jena/riot/lang/extra/javacc" +DIR="../../src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc" FILE="turtle.jj" CLASS=TurtleJavacc diff --git a/jena-arq/Grammar/Turtle/turtle.jj b/jena-arq/Grammar/Turtle/turtle.jj index 70a2985820..8126b8a876 100644 --- a/jena-arq/Grammar/Turtle/turtle.jj +++ b/jena-arq/Grammar/Turtle/turtle.jj @@ -52,13 +52,12 @@ PARSER_BEGIN(TurtleJavacc) * limitations under the License. */ -package org.apache.jena.riot.lang.extra.javacc; +package org.apache.jena.riot.lang.turtlejcc.javacc; import org.apache.jena.graph.*; -import org.apache.jena.riot.lang.* ; -import static org.apache.jena.riot.lang.LangParserLib.*; +import org.apache.jena.riot.lang.turtlejcc.*; -public class TurtleJavacc extends LangParserBase +public class TurtleJavacc extends LangTurtleJCCParserBase {} PARSER_END(TurtleJavacc) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/LangTurtleJCCParserBase.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/LangTurtleJCCParserBase.java new file mode 100644 index 0000000000..8e6dfb6960 --- /dev/null +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/LangTurtleJCCParserBase.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.riot.lang.turtlejcc; + +import org.apache.jena.atlas.AtlasException; +import org.apache.jena.atlas.lib.EscapeStr; +import org.apache.jena.datatypes.RDFDatatype; +import org.apache.jena.datatypes.TypeMapper; +import org.apache.jena.datatypes.xsd.XSDDatatype; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.Triple; +import org.apache.jena.query.ARQ; +import org.apache.jena.riot.RiotException; +import org.apache.jena.riot.RiotParseException; +import org.apache.jena.riot.lang.LangParserLib; +import org.apache.jena.riot.system.ParserProfile; +import org.apache.jena.riot.system.RiotLib; +import org.apache.jena.sparql.core.Var; +import org.apache.jena.sparql.graph.NodeConst; +import org.apache.jena.vocabulary.RDF; + +// Isolated LangParserBase +public class LangTurtleJCCParserBase { + + protected final Node XSD_TRUE = NodeConst.nodeTrue ; + protected final Node XSD_FALSE = NodeConst.nodeFalse ; + + protected final Node nRDFtype = NodeConst.nodeRDFType ; + + protected final Node nRDFnil = NodeConst.nodeNil ; + protected final Node nRDFfirst = NodeConst.nodeFirst ; + protected final Node nRDFrest = NodeConst.nodeRest ; + + protected final Node nRDFsubject = RDF.Nodes.subject ; + protected final Node nRDFpredicate = RDF.Nodes.predicate ; + protected final Node nRDFobject = RDF.Nodes.object ; + + protected final Node nRDFreifies = RDF.Nodes.reifies; + + protected OutputRDF output; + protected ParserProfile profile; + + public LangTurtleJCCParserBase() { } + + // These are essential calls unless the parser takes over the functions. + // They can't easily be in the constructor because this class is inherited + // by the JavaCC generated parser. + public void setProfile(ParserProfile profile) { + this.profile = profile; + } + + public void setDest(OutputRDF output) { + this.output = output; + } + + // ---- From LangParserLib + /** Remove the first n characters from the string */ + public static String stripChars(String s, int n) { + return s.substring(n, s.length()) ; + } + + /** Remove first and last characters (e.g. ' or "") from a string */ + public static String stripQuotes(String s) { + return s.substring(1, s.length() - 1) ; + } + + /** Remove first 3 and last 3 characters (e.g. ''' or """) from a string */ + public static String stripQuotes3(String s) { + return s.substring(3, s.length() - 3) ; + } + + /** Unescape \t, \n etc.*/ + public static String unescapeStr(String s, int line, int column) + { return unescape(s, '\\', false, line, column) ; } + + // Worker function + private static String unescape(String s, char escape, boolean pointCodeOnly, int line, int column) { + try { + return EscapeStr.unescape(s, escape, pointCodeOnly) ; + } catch (AtlasException ex) { + throw new RiotParseException(ex.getMessage(), line, column) ; + } + } + // ---- From LangParserLib + + /** + * Standardize a prefix - prefixes do not include the ":". + */ + protected String canonicalPrefix(String prefix, int line, int column) { + if ( prefix.endsWith(":") ) + prefix = prefix.substring(0, prefix.length() - 1) ; + return prefix ; + } + + protected Node createURI(String iriStr, int line, int column) { + checkRDFString(iriStr, line, column); + return profile.createURI(iriStr, line, column); + } + + protected Node createBNode(int line, int column) { + return profile.createBlankNode(null, line, column); + } + + protected Node createBNode(String label, int line, int column) { + return profile.createBlankNode(null, label, line, column); + } + + protected Node createListNode(int line, int column) { + return createBNode(line, column); + } + + /** + * Apply any checks for "RDF String" to a string that has already had escape processing applied. + * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates. + * Because this is java, we test for no non-paired surrogates. + * A surrogate pair is high-low. + */ + protected static void checkRDFString(String string, int line, int column) { + for ( int i = 0 ; i < string.length() ; i++ ) { + // Not "codePointAt" which does surrogate processing. + char ch = string.charAt(i); + + if ( ! Character.isValidCodePoint(ch) ) + throw new RiotParseException(String.format("Illegal code point in \\U sequence value: 0x%08X", ch), line, column); + + // Check surrogate pairs are pairs. + if ( Character.isHighSurrogate(ch) ) { + i++; + if ( i == string.length() ) + throw new RiotParseException("Bad surrogate pair (end of string)", line, column); + char ch1 = string.charAt(i); + if ( ! Character.isLowSurrogate(ch1) ) { + throw new RiotParseException("Bad surrogate pair (high surrogate not followed by low surrogate)", line, column); + } + } else if ( Character.isLowSurrogate(ch) ) { + throw new RiotParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column); + } + } + } + + protected Node createLiteral(String lexicalForm, String langTag, String datatypeURI, int line, int column) { + Node n = null ; + // Can't have type and lang tag in parsing. + if ( datatypeURI != null ) { + RDFDatatype dType = TypeMapper.getInstance().getSafeTypeByName(datatypeURI) ; + n = profile.createTypedLiteral(lexicalForm, dType, line, column) ; + } else if ( langTag != null && !langTag.isEmpty() ) + n = profile.createLangLiteral(lexicalForm, langTag, line, column) ; + else + n = profile.createStringLiteral(lexicalForm, line, column) ; + return n ; + } + + protected Node createTripleTerm(Node s, Node p, Node o, int line, int column) { + return profile.createTripleTerm(s, p, o, line, column); + } + + protected Node createLiteralInteger(String lexicalForm, int line, int column) { + return profile.createTypedLiteral(lexicalForm, XSDDatatype.XSDinteger, line, column); + } + + protected Node createLiteralDecimal(String lexicalForm, int line, int column) { + return profile.createTypedLiteral(lexicalForm, XSDDatatype.XSDdecimal, line, column); + } + + protected Node createLiteralDouble(String lexicalForm, int line, int column) { + return profile.createTypedLiteral(lexicalForm, XSDDatatype.XSDdouble, line, column); + } + + protected Var createVariable(String varName, int line, int column) { + varName = varName.substring(1) ; // Drop the marker + return Var.alloc(varName) ; + } + + protected String resolvePName(String pname, int line, int column) { + int idx = pname.indexOf(':'); + String prefix = pname.substring(0, idx); + String localPart = pname.substring(idx+1); + localPart = LangParserLib.unescapePName(localPart, line, column); + String expansion = profile.getPrefixMap().expand(prefix, localPart); + if ( expansion == null ) { + if ( ARQ.isTrue(ARQ.fixupUndefinedPrefixes) ) + return RiotLib.fixupPrefixIRI(prefix, localPart); + profile.getErrorHandler().fatal("Undefined prefix: " + prefix, line, column); + } + return expansion; + } + + protected String resolveQuotedIRI(String iriStr, int line, int column) { + iriStr = LangParserLib.stripQuotes(iriStr); + iriStr = unescapeIRI(iriStr); + checkRDFString(iriStr, line, column); + // Check + if ( iriStr.contains("<") || iriStr.contains(">") ) + throw new RiotParseException("Illegal character '<' or '>' in IRI: '"+iriStr+"'", line, column); + return profile.resolveIRI(iriStr, line, column); + } + + protected void setBase(String iri, int line, int column) { + profile.setBaseIRI(iri); + output.base(iri); + } + + protected void setPrefix(String prefix, String iri, int line, int column) { + prefix = canonicalPrefix(prefix, line, column); + profile.getPrefixMap().add(prefix,iri); + output.prefix(prefix, iri); + } + + protected void declareVersion(String version, int line, int column) { + output.version(version); + } + + protected void emitTriple(int line, int column, Node s, Node p, Node o) { + output.triple(Triple.create(s, p, o)); + } + + protected Node emitTripleReifier(int line, int column, Node reifierId, Node s, Node p, Node o) { + Node tripleTerm = createTripleTerm(s, p, o, line, column); + if ( reifierId == null ) + reifierId = createBNode(line, column); + Triple reifiedTriple = Triple.create(reifierId, nRDFreifies, tripleTerm); + output.triple(reifiedTriple); + return reifierId; + } + + private Node annotationReifierId = null; + + protected void setReifierId(Node reifId) { + annotationReifierId = reifId; + } + + protected Node getOrAllocReifierId(Node s, Node p, Node o, int line, int column) { + if ( annotationReifierId != null ) + return annotationReifierId; + Node reifierId = createBNode(line, column); + emitTripleReifier(line, column, reifierId, s, p, o); + return reifierId; + } + + protected void clearReifierId() { + annotationReifierId = null; + } + + protected String unescapeIRI(String iriStr) { + try { + return EscapeStr.unescape(iriStr, '\\', true); + } catch (AtlasException ex) { + throw new RiotException(ex.getMessage()); + } + } + + protected void listStart(int line, int column) {}; + protected void listTriple(int line, int column, Node s, Node p , Node o) { emitTriple(line, column, s, p, o); } + protected void listFinish(int line, int column) {}; +} diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/OutputRDF.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/OutputRDF.java new file mode 100644 index 0000000000..0ab2ac86b7 --- /dev/null +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/OutputRDF.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.apache.jena.riot.lang.turtlejcc; + +import org.apache.jena.graph.Triple; +import org.apache.jena.sparql.core.Quad; + +/** The StreamRDF interface but extracted so it can be used in jena-core. */ + +public interface OutputRDF { + /** Start processing */ + public void start() ; + + /** Triple emitted */ + public void triple(Triple triple) ; + + /** Quad emitted */ + public void quad(Quad quad) ; + + /** base declaration seen */ + public void base(String base) ; + + /** prefix declaration seen */ + public void prefix(String prefix, String iri) ; + + /** version declaration seen */ + public void version(String version) ; + + /** Finish processing */ + public void finish() ; +} diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/TurtleJavaccReaderRIOT.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/TurtleJavaccReaderRIOT.java index 6965d9fdce..673315752f 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/TurtleJavaccReaderRIOT.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/TurtleJavaccReaderRIOT.java @@ -23,6 +23,7 @@ import java.io.Reader; import org.apache.jena.atlas.io.IO; import org.apache.jena.atlas.web.ContentType; +import org.apache.jena.graph.Triple; import org.apache.jena.riot.ReaderRIOT; import org.apache.jena.riot.RiotParseException; import org.apache.jena.riot.lang.LangTurtle; @@ -31,6 +32,7 @@ import org.apache.jena.riot.lang.turtlejcc.javacc.TokenMgrError; import org.apache.jena.riot.lang.turtlejcc.javacc.TurtleJavacc; import org.apache.jena.riot.system.ParserProfile; import org.apache.jena.riot.system.StreamRDF; +import org.apache.jena.sparql.core.Quad; import org.apache.jena.sparql.util.Context; /** @@ -48,11 +50,11 @@ public class TurtleJavaccReaderRIOT implements ReaderRIOT { public TurtleJavaccReaderRIOT(ParserProfile profile) { this.profile = profile; } @Override - public void read(InputStream in, String baseURI, ContentType ct, StreamRDF output, Context context) { + public void read(InputStream in, String baseURI, ContentType ct, StreamRDF streamDest, Context context) { // Do bytes -> chars in big units. Reader r = IO.asBufferedUTF8(in); TurtleJavacc parser = new TurtleJavacc(r); - read(parser, baseURI, ct, output, context); + read(parser, baseURI, ct, streamDest, context); } @Override @@ -63,7 +65,8 @@ public class TurtleJavaccReaderRIOT implements ReaderRIOT { read(parser, baseURI, ct, output, context); } - private void read(TurtleJavacc parser, String baseURI, ContentType ct, StreamRDF output, Context context) { + private void read(TurtleJavacc parser, String baseURI, ContentType ct, StreamRDF streamDest, Context context) { + OutputRDF output = new StreamRDFJCC(streamDest); parser.setDest(output); parser.setProfile(profile); // profile should be setup correctly for the base @@ -81,4 +84,49 @@ public class TurtleJavaccReaderRIOT implements ReaderRIOT { throw new RiotParseException(ex.getMessage(), -1 , -1); } } + + // Could add OutputRDF to StreamRDF. + // This parser is not used for speed so keep OutputRDF local. + // OutputRDF is part of isolating the parser engine from RIOT. + static class StreamRDFJCC implements OutputRDF { + private final StreamRDF dest; + StreamRDFJCC(StreamRDF dest) { + this.dest = dest; + } + + @Override + public void start() { + dest.start(); + } + + @Override + public void triple(Triple triple) { + dest.triple(triple); + } + + @Override + public void quad(Quad quad) { + dest.quad(quad); + } + + @Override + public void base(String base) { + dest.base(base); + } + + @Override + public void prefix(String prefix, String iri) { + dest.prefix(prefix, iri); + } + + @Override + public void version(String version) { + dest.version(version); + } + + @Override + public void finish() { + dest.finish(); + } + } } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/ParseException.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/ParseException.java index 7837b35726..dc7695a58b 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/ParseException.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/ParseException.java @@ -210,4 +210,4 @@ public class ParseException extends Exception { } } -/* JavaCC - OriginalChecksum=cd2fabc6c8379d5b5b624c8633603a52 (do not edit this line) */ +/* JavaCC - OriginalChecksum=29e7bb342c6cf3e245929b9fd8149bac (do not edit this line) */ diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/SimpleCharStream.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/SimpleCharStream.java index b0453cc3b0..deffc6cbf8 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/SimpleCharStream.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/SimpleCharStream.java @@ -487,4 +487,4 @@ public class SimpleCharStream boolean getTrackLineColumn() { return trackLineColumn; } void setTrackLineColumn(boolean tlc) { trackLineColumn = tlc; } } -/* JavaCC - OriginalChecksum=40a887a9b16451c5e7c386e3fd5fddf5 (do not edit this line) */ +/* JavaCC - OriginalChecksum=32186373cb0e81e8631f76d36a3caa8c (do not edit this line) */ diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/Token.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/Token.java index 1e1826dee9..f7d0c9be7a 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/Token.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/Token.java @@ -147,4 +147,4 @@ public class Token implements java.io.Serializable { } } -/* JavaCC - OriginalChecksum=9b3baee77fab9edf5314ae6d1172406e (do not edit this line) */ +/* JavaCC - OriginalChecksum=86f7e20338a37f70eadbb3337db56872 (do not edit this line) */ diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TokenMgrError.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TokenMgrError.java index 55446626e8..412bb8926b 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TokenMgrError.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TokenMgrError.java @@ -164,4 +164,4 @@ public class TokenMgrError extends Error this(LexicalErr(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=d9be68a0e6bddc4b5d4d52579b110992 (do not edit this line) */ +/* JavaCC - OriginalChecksum=568f4fa9a2786005de0bec225a66eeae (do not edit this line) */ diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavacc.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavacc.java index c45b3b9888..6c4537e881 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavacc.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavacc.java @@ -21,10 +21,9 @@ package org.apache.jena.riot.lang.turtlejcc.javacc; import org.apache.jena.graph.*; -import org.apache.jena.riot.lang.* ; -import static org.apache.jena.riot.lang.LangParserLib.*; +import org.apache.jena.riot.lang.turtlejcc.*; -public class TurtleJavacc extends LangParserBase implements TurtleJavaccConstants { +public class TurtleJavacc extends LangTurtleJCCParserBase implements TurtleJavaccConstants { // Entry point final public void parse() throws ParseException { diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavaccTokenManager.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavaccTokenManager.java index d708810930..2b5ba0a61b 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavaccTokenManager.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/turtlejcc/javacc/TurtleJavaccTokenManager.java @@ -20,8 +20,7 @@ package org.apache.jena.riot.lang.turtlejcc.javacc; import org.apache.jena.graph.*; -import org.apache.jena.riot.lang.* ; -import static org.apache.jena.riot.lang.LangParserLib.*; +import org.apache.jena.riot.lang.turtlejcc.*; /** Token Manager. */ @SuppressWarnings ("unused")
