This is an automated email from the ASF dual-hosted git repository.
andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git
The following commit(s) were added to refs/heads/main by this push:
new 2c9782c6d7 GH-2740: Faster parsing of RDF/XML
2c9782c6d7 is described below
commit 2c9782c6d7647cf29124ca281887f81a83fb7e24
Author: arne-bdt <[email protected]>
AuthorDate: Sat Sep 28 22:14:56 2024 +0200
GH-2740: Faster parsing of RDF/XML
Parsers: RRX.RDFXML_SAX, RRX.RDFXML_StAX_ev, RRX.RDFXML_StAX_sr
- added "public Node createURI(IRIx iriX, ...);" to the ParserProfile,
which simply uses the given IRI instead of resolving it again.
- added general IRIx caching (org.apache.jena.atlas.lib.cache.CacheSimple)
in the parsers, for the cases where the already cached
org.apache.jena.riot.system.ParserProfileStd#resolver is not applicable
- removed unused code and variables from ParserRRX_StAX_SR and
ParserRRX_StAX_EV
- added `org.apache.jena.riot.lang.rdfxml.TestXMLParser` in
jena-benchmarks-jmh
---
.../org/apache/jena/riot/lang/rdfxml/SysRRX.java | 15 ++-
.../jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java | 78 ++++++++---
.../lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java | 115 +++++++++--------
.../lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java | 118 ++++++++---------
.../org/apache/jena/riot/system/ParserProfile.java | 6 +-
.../apache/jena/riot/system/ParserProfileStd.java | 5 +
.../jena/riot/system/ParserProfileWrapper.java | 6 +
.../java/org/apache/jena/system/TestReadXML.java | 17 ++-
.../jena/riot/lang/rdfxml/TestXMLParser.java | 143 +++++++++++++++++++++
.../java/org/apache/jena/util/JenaXMLInput.java | 5 +-
10 files changed, 370 insertions(+), 138 deletions(-)
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
index 17a6173b26..90deea6e18 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
@@ -29,8 +29,21 @@ import org.apache.jena.util.JenaXMLInput;
*/
public class SysRRX {
+ /**
+ * Creates an XMLInputFactory via
javax.xml.stream.XMLInputFactory#newInstance() and configures it.
+ * @return the configured XMLInputFactory
+ */
public static XMLInputFactory createXMLInputFactory() {
- XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
+ return initAndConfigure(XMLInputFactory.newInstance());
+ }
+
+ /**
+ * Configures the given factory to be safe and sets necessary properties.
+ * This method should be called when a factory other than
+ * javax.xml.stream.XMLInputFactory#newInstance() is used.
+ * @param xmlInputFactory the factory to configure
+ */
+ public static <E extends XMLInputFactory> E initAndConfigure(final E
xmlInputFactory) {
JenaXMLInput.initXMLInputFactory(xmlInputFactory);
// Additional features. Enable character entity support.
xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.TRUE);
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
index d6aa5c68e1..30daf48700 100644
---
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
+++
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
@@ -29,6 +29,8 @@ import javax.xml.namespace.QName;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
import org.apache.jena.atlas.lib.EscapeStr;
import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -61,6 +63,7 @@ class ParserRRX_SAX
LexicalHandler,
DeclHandler,
EntityResolver2 {
+ private static int IRI_CACHE_SIZE = 8192;
private static boolean VERBOSE = false;
// Addition tracing for SAX events we don't care about.
private static boolean EVENTS = false;
@@ -299,8 +302,8 @@ class ParserRRX_SAX
Counter containerPropertyCounter,
NodeHolder collectionNode,
Emitter emitter,
- ParserMode parserMode
- ) {}
+ ParserMode parserMode,
+ Cache<String, IRIx> iriCache) {}
private Deque<ParserFrame> parserStack = new ArrayDeque<>();
@@ -320,7 +323,8 @@ class ParserRRX_SAX
containerPropertyCounter,
collectionNode,
currentEmitter,
- frameParserMode);
+ frameParserMode,
+ currentIriCache);
parserStack.push(frame);
}
@@ -331,8 +335,10 @@ class ParserRRX_SAX
trace.printf("Pop frame: S: %s -> %s : P: %s -> %s\n",
str(currentSubject), frame.subject,
str(currentProperty), frame.property);
}
-
- this.currentBase = frame.base;
+ if(isDifferentFromCurrentBase(frame.base)) {
+ this.currentBase = frame.base;
+ this.currentIriCache = frame.iriCache;
+ }
this.currentLang = frame.lang;
this.currentSubject = frame.subject;
this.currentProperty = frame.property;
@@ -382,6 +388,32 @@ class ParserRRX_SAX
private final String initialXmlBase;
private final String initialXmlLang;
private final StreamRDF destination;
+ private Cache<String, IRIx> iriCacheForBaseNull = null;
+ private Cache<String, IRIx> currentIriCache = null;
+ private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new
HashMap<>();
+
+ private void updateCurrentIriCacheForCurrentBase() {
+ if(currentBase != null) {
+ currentIriCache = mapBaseIriToCache
+ .computeIfAbsent(currentBase,
+ b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+ );
+ } else {
+ if(iriCacheForBaseNull == null) {
+ iriCacheForBaseNull =
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+ }
+ currentIriCache = iriCacheForBaseNull;
+ }
+ }
+
+ private boolean isDifferentFromCurrentBase(IRIx base) {
+ if(currentBase != null) {
+ return !currentBase.equals(base);
+ } else if(base == null) {
+ return false;
+ }
+ return true;
+ }
// Tracking for ID on nodes (not reification usage)
// We limit the number of local fragment IDs tracked because map only
grows.
@@ -476,6 +508,7 @@ class ParserRRX_SAX
} else {
this.currentBase = null;
}
+ updateCurrentIriCacheForCurrentBase();
this.currentLang = "";
this.destination = destination;
}
@@ -655,7 +688,11 @@ class ParserRRX_SAX
String xmlBaseURI = attributes.getValue(xmlNS, xmlBaseLN);
if ( xmlBaseURI != null ) {
emitBase(xmlBaseURI, position);
- currentBase = resolveIRIx(xmlBaseURI, position);
+ var newBase = resolveIRIx(xmlBaseURI, position);
+ if(!newBase.equals(currentBase)) {
+ currentBase = newBase;
+ updateCurrentIriCacheForCurrentBase();
+ }
}
for ( int i = 0 ; i < attributes.getLength() ; i++ ) {
@@ -962,8 +999,9 @@ class ParserRRX_SAX
if ( xmlLang != null )
trace.printf("+ LANG @%s\n", xmlLang);
}
- if ( xmlBase != null ) {
+ if ( xmlBase != null && !xmlBase.equals(currentBase)) {
currentBase = xmlBase;// resolve.
+ updateCurrentIriCacheForCurrentBase();
}
if ( xmlLang != null )
@@ -1364,11 +1402,9 @@ class ParserRRX_SAX
private Node iriResolve(String uriStr, Position position) {
Objects.requireNonNull(uriStr);
Objects.requireNonNull(position);
- if ( uriStr.startsWith("_:") )
- // <_:label> syntax. Handled by the FactoryRDF via the parser
profile.
- return createURI(uriStr, position);
- String resolved = resolveIRIx(uriStr, position).str();
- return createURI(resolved, position);
+ return uriStr.startsWith("_:")
+ ? createURI(uriStr, position) // <_:label> syntax. Handled by
the FactoryRDF via the parser profile.
+ : createURI(resolveIRIx(uriStr, position), position);
}
private IRIx resolveIRIx(String uriStr, Position position) {
@@ -1386,10 +1422,13 @@ class ParserRRX_SAX
/** String to IRIx, no opinion */
private IRIx resolveIRIxAny(String uriStr, Position position) {
try {
- IRIx iri = ( currentBase != null )
- ? currentBase.resolve(uriStr)
- : IRIx.create(uriStr);
- return iri;
+ return currentIriCache.get(uriStr, uri -> {
+ if( currentBase != null ) {
+ return currentBase.resolve(uri);
+ } else {
+ return IRIx.create(uriStr);
+ }
+ });
} catch (IRIException ex) {
throw RDFXMLparseError(ex.getMessage(), position);
}
@@ -1403,6 +1442,13 @@ class ParserRRX_SAX
return parserProfile.createURI(iriStr, line, col);
}
+ private Node createURI(IRIx iriX, Position position) {
+ int line = position.line();
+ int col = position.column();
+ // Checking
+ return parserProfile.createURI(iriX, line, col);
+ }
+
private Node blankNode(Position position) {
Objects.requireNonNull(position);
int line = position.line();
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
index eaefd06559..e30fe31b77 100644
---
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
+++
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
@@ -31,6 +31,8 @@ import javax.xml.stream.events.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
import org.apache.jena.atlas.lib.EscapeStr;
import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -51,44 +53,70 @@ import org.apache.jena.vocabulary.RDF;
/** StAX events */
class ParserRRX_StAX_EV {
+ private static int IRI_CACHE_SIZE = 8192;
private static boolean EVENTS = false;
private final IndentedWriter trace;
private final XMLEventReader xmlEventReader;
+
+ private Cache<String, IRIx> iriCacheForBaseNull = null;
+ private Cache<String, IRIx> currentIriCache = null;
+ private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new
HashMap<>();
// Stacks.
// Constants
- private static final String XML_PREFIX = "xml";
private static final String rdfNS = RDF.uri;
private static final String xmlNS = "http://www.w3.org/XML/1998/namespace";
- private static final String ID = "ID";
- private static final String NODE_ID = "nodeID";
- private static final String ABOUT = "about";
- private int blankNodeCounter = 0 ;
private boolean hasRDF = false;
private final ParserProfile parserProfile;
private final ErrorHandler errorHandler;
- private final Context context;
- private final String initialXmlBase;
- private final String initialXmlLang;
private final StreamRDF destination;
- private record BaseLang(IRIx base, String lang) {}
+ private void updateCurrentIriCacheForCurrentBase() {
+ if(currentBase != null) {
+ currentIriCache = mapBaseIriToCache
+ .computeIfAbsent(currentBase,
+ b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+ );
+ } else {
+ if(iriCacheForBaseNull == null) {
+ iriCacheForBaseNull =
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+ }
+ currentIriCache = iriCacheForBaseNull;
+ }
+ }
+
+ private boolean isDifferentFromCurrentBase(IRIx base) {
+ if(currentBase != null) {
+ return !currentBase.equals(base);
+ } else if(base == null) {
+ return false;
+ }
+ return true;
+ }
+
+ private record BaseLang(IRIx base, String lang, Cache<String, IRIx>
iriCache) {}
private Deque<BaseLang> stack = new ArrayDeque<>();
// Just these operations:
private void pushFrame(IRIx base, String lang) {
- BaseLang frame = new BaseLang(currentBase, currentLang);
+ BaseLang frame = new BaseLang(currentBase, currentLang,
currentIriCache);
stack.push(frame);
- currentBase = base;
currentLang = lang;
+ if(isDifferentFromCurrentBase(base)) {
+ currentBase = base;
+ updateCurrentIriCacheForCurrentBase();
+ }
}
private void popFrame() {
BaseLang frame = stack.pop();
- currentBase = frame.base;
currentLang = frame.lang;
+ if(isDifferentFromCurrentBase(frame.base)) {
+ currentBase = frame.base;
+ currentIriCache = frame.iriCache;
+ }
}
/** Mark the usage of a QName */
@@ -165,16 +193,14 @@ class ParserRRX_StAX_EV {
this.xmlEventReader = reader;
this.parserProfile = parserProfile;
- this.context = context;
this.errorHandler = parserProfile.getErrorHandler();
- this.initialXmlBase = xmlBase;
- this.initialXmlLang = "";
if ( xmlBase != null ) {
this.currentBase = IRIx.create(xmlBase);
- //parserProfile.setBaseIRI(currentBase.str());
+ parserProfile.setBaseIRI(currentBase.str());
} else {
this.currentBase = null;
}
+ updateCurrentIriCacheForCurrentBase();
this.currentLang = "";
this.destination = destination;
}
@@ -186,10 +212,6 @@ class ParserRRX_StAX_EV {
private static final QName rdfAbout = new QName(rdfNS, "about");
private static final QName rdfType = new QName(rdfNS, "type");
- private static final QName rdfSeq = new QName(rdfNS, "Seq");
- private static final QName rdfBag = new QName(rdfNS, "Bag");
- private static final QName rdfAlt = new QName(rdfNS, "Alt");
-
private static final QName rdfContainerItem = new QName(rdfNS, "li");
private static final QName rdfDatatype = new QName(rdfNS, "datatype");
private static final QName rdfParseType = new QName(rdfNS, "parseType");
@@ -985,8 +1007,9 @@ class ParserRRX_StAX_EV {
}
// Not seen this prefix or it was a different value.
- if ( ! namespaces.containsKey(prefix) ||
- ( namespaceURI != null && !
namespaces.get(prefix).equals(namespaceURI)) ) {
+ if ( namespaceURI != "" && // this first condition is needed for
Woodstox and Aalto to work
+ (! namespaces.containsKey(prefix) ||
+ ( namespaceURI != null && !
namespaces.get(prefix).equals(namespaceURI)) )) {
// Define in current XML subtree.
outputNS.put(prefix, namespaceURI);
namespaces.put(prefix, namespaceURI);
@@ -1170,12 +1193,6 @@ class ParserRRX_StAX_EV {
// ---- Nodes
- private void setBase(String uriStr, Location location) {
- // Resolves
- Node n = iriResolve(uriStr, location);
- parserProfile.setBaseIRI(n.getURI());
- }
-
private Node qNameToIRI(QName qName, QNameUsage usage, Location location) {
if ( StringUtils.isBlank(qName.getNamespaceURI()) )
throw RDFXMLparseError("Unqualified "+usage.msg+" not allowed:
<"+qName.getLocalPart()+">", location);
@@ -1375,11 +1392,8 @@ class ParserRRX_StAX_EV {
}
boolean hasFrame = (xmlBase != null || xmlLang != null);
if ( hasFrame ) {
- pushFrame(currentBase, currentLang);
- if ( xmlBase != null )
- currentBase = xmlBase;
- if ( xmlLang != null )
- currentLang = xmlLang;
+ pushFrame(xmlBase != null ? xmlBase : currentBase,
+ xmlLang != null ? xmlLang : currentLang);
}
return hasFrame;
}
@@ -1475,18 +1489,11 @@ class ParserRRX_StAX_EV {
private Node iriResolve(String uriStr, Location location) {
Objects.requireNonNull(uriStr);
Objects.requireNonNull(location);
- int line = location.getLineNumber();
- int col = location.getColumnNumber();
- String resolved = resolveIRI(uriStr, location);
- return parserProfile.createURI(resolved, line, col);
- }
-
- /** Resolve an IRI. */
- private String resolveIRI(String uriStr, Location location) {
- if ( uriStr.startsWith("_:") )
- // <_:label> syntax. Handled by the FactoryRDF via the parser
profile.
- return uriStr;
- return resolveIRIx(uriStr, location).str();
+ final int line = location.getLineNumber();
+ final int col = location.getColumnNumber();
+ return uriStr.startsWith("_:")
+ ? parserProfile.createURI(uriStr, line, col) // <_:label>
syntax. Handled by the FactoryRDF via the parser profile.
+ : parserProfile.createURI(resolveIRIx(uriStr, location),
line, col);
}
private IRIx resolveIRIx(String uriStr, Location location) {
@@ -1503,10 +1510,13 @@ class ParserRRX_StAX_EV {
private IRIx resolveIRIxNoWarning(String uriStr, Location location) {
try {
- IRIx iri = ( currentBase != null )
- ? currentBase.resolve(uriStr)
- : IRIx.create(uriStr);
- return iri;
+ return currentIriCache.get(uriStr, uri -> {
+ if( currentBase != null ) {
+ return currentBase.resolve(uri);
+ } else {
+ return IRIx.create(uriStr);
+ }
+ });
} catch (IRIException ex) {
throw RDFXMLparseError(ex.getMessage(), location);
}
@@ -1528,13 +1538,6 @@ class ParserRRX_StAX_EV {
return parserProfile.createBlankNode(null, label, line, col);
}
-// private Node literal(String lex, String datatype, String lang, Location
location) {
-// int line = location.getLineNumber();
-// int col = location.getColumnNumber();
-// return parserProfile.createL
-// }
- // literal(lex, datatype, lang)
-
private Node literal(String lexical, Location location) {
Objects.requireNonNull(lexical);
Objects.requireNonNull(location);
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
index 9f11b5954f..f021a8e1c4 100644
---
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
+++
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
@@ -33,6 +33,8 @@ import javax.xml.stream.events.XMLEvent;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
import org.apache.jena.atlas.lib.EscapeStr;
import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -54,44 +56,70 @@ import org.apache.jena.vocabulary.RDF.Nodes;
/* StAX - stream reader */
class ParserRRX_StAX_SR {
+ private static int IRI_CACHE_SIZE = 8192;
private static boolean EVENTS = false;
private final IndentedWriter trace;
private final XMLStreamReader xmlSource;
+
+ private Cache<String, IRIx> iriCacheForBaseNull = null;
+ private Cache<String, IRIx> currentIriCache = null;
+ private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new
HashMap<>();
// Stacks.
// Constants
- private static final String XML_PREFIX = "xml";
private static final String rdfNS = RDF.uri;
private static final String xmlNS = "http://www.w3.org/XML/1998/namespace";
- private static final String ID = "ID";
- private static final String NODE_ID = "nodeID";
- private static final String ABOUT = "about";
- private int blankNodeCounter = 0 ;
private boolean hasRDF = false;
private final ParserProfile parserProfile;
private final ErrorHandler errorHandler;
- private final Context context;
- private final String initialXmlBase;
- private final String initialXmlLang;
private final StreamRDF destination;
- private record BaseLang(IRIx base, String lang) {}
+ private void updateCurrentIriCacheForCurrentBase() {
+ if(currentBase != null) {
+ currentIriCache = mapBaseIriToCache
+ .computeIfAbsent(currentBase,
+ b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+ );
+ } else {
+ if(iriCacheForBaseNull == null) {
+ iriCacheForBaseNull =
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+ }
+ currentIriCache = iriCacheForBaseNull;
+ }
+ }
+
+ private boolean isDifferentFromCurrentBase(IRIx base) {
+ if(currentBase != null) {
+ return !currentBase.equals(base);
+ } else if(base == null) {
+ return false;
+ }
+ return true;
+ }
+
+ private record BaseLang(IRIx base, String lang, Cache<String, IRIx>
iriCache) {}
private Deque<BaseLang> stack = new ArrayDeque<>();
// Just these operations:
private void pushFrame(IRIx base, String lang) {
- BaseLang frame = new BaseLang(currentBase, currentLang);
+ BaseLang frame = new BaseLang(currentBase, currentLang,
currentIriCache);
stack.push(frame);
- currentBase = base;
currentLang = lang;
+ if(isDifferentFromCurrentBase(base)) {
+ currentBase = base;
+ updateCurrentIriCacheForCurrentBase();
+ }
}
private void popFrame() {
BaseLang frame = stack.pop();
- currentBase = frame.base;
currentLang = frame.lang;
+ if(isDifferentFromCurrentBase(frame.base)) {
+ currentBase = frame.base;
+ currentIriCache = frame.iriCache;
+ }
}
/** Mark the usage of a QName */
@@ -126,6 +154,7 @@ class ParserRRX_StAX_SR {
else
errorHandler.warning(message, -1, -1);
}
+
// Tracking for ID on nodes (not reification usage)
// We limit the number of local fragment IDs tracked because map only
grows.
// A base URI may be re-introduced so this isn't nested scoping.
@@ -165,16 +194,14 @@ class ParserRRX_StAX_SR {
this.xmlSource = reader;
this.parserProfile = parserProfile;
- this.context = context;
this.errorHandler = parserProfile.getErrorHandler();
- this.initialXmlBase = xmlBase;
- this.initialXmlLang = "";
if ( xmlBase != null ) {
this.currentBase = IRIx.create(xmlBase);
parserProfile.setBaseIRI(currentBase.str());
} else {
this.currentBase = null;
}
+ updateCurrentIriCacheForCurrentBase();
this.currentLang = "";
this.destination = destination;
}
@@ -186,10 +213,6 @@ class ParserRRX_StAX_SR {
private static final QName rdfAbout = new QName(rdfNS, "about");
private static final QName rdfType = new QName(rdfNS, "type");
- private static final QName rdfSeq = new QName(rdfNS, "Seq");
- private static final QName rdfBag = new QName(rdfNS, "Bag");
- private static final QName rdfAlt = new QName(rdfNS, "Alt");
-
private static final QName rdfContainerItem = new QName(rdfNS, "li");
private static final QName rdfDatatype = new QName(rdfNS, "datatype");
private static final QName rdfParseType = new QName(rdfNS, "parseType");
@@ -239,7 +262,6 @@ class ParserRRX_StAX_SR {
return $coreSyntaxTerms.contains(qName);
}
-
// 6.2.5 Production nodeElementURIs
// anyURI - ( coreSyntaxTerms | rdf:li | oldTerms )
private static boolean allowedNodeElementURIs(QName qName) {
@@ -952,8 +974,9 @@ class ParserRRX_StAX_SR {
}
// Not seen this prefix or it was a different value.
- if ( ! namespaces.containsKey(prefix) ||
- ( namespaceURI != null && !
namespaces.get(prefix).equals(namespaceURI)) ) {
+ if ( namespaceURI != "" && // this first condition is needed for
Woodstox and Aalto to work
+ (! namespaces.containsKey(prefix) ||
+ ( namespaceURI != null && !
namespaces.get(prefix).equals(namespaceURI)) )) {
// Define in current XML subtree.
outputNS.put(prefix, namespaceURI);
namespaces.put(prefix, namespaceURI);
@@ -1126,12 +1149,6 @@ class ParserRRX_StAX_SR {
// ---- Nodes
- private void setBase(String uriStr, Location location) {
- Node n = iriResolve(uriStr, location);
- parserProfile.setBaseIRI(n.getURI());
- }
-
- /** This is the RDF rule for creating an IRI from a QName. */
private Node qNameToIRI(QName qName, QNameUsage usage, Location location) {
if ( StringUtils.isBlank(qName.getNamespaceURI()) )
throw RDFXMLparseError("Unqualified "+usage.msg+" not allowed:
<"+qName.getLocalPart()+">", location);
@@ -1331,11 +1348,8 @@ class ParserRRX_StAX_SR {
}
boolean hasFrame = (xmlBase != null || xmlLang != null);
if ( hasFrame ) {
- pushFrame(currentBase, currentLang);
- if ( xmlBase != null )
- currentBase = xmlBase;
- if ( xmlLang != null )
- currentLang = xmlLang;
+ pushFrame(xmlBase != null ? xmlBase : currentBase,
+ xmlLang != null ? xmlLang : currentLang);
}
return hasFrame;
}
@@ -1369,8 +1383,8 @@ class ParserRRX_StAX_SR {
emitBase(xmlBase);
int numNS = xmlSource.getNamespaceCount();
for ( int i = 0 ; i < numNS ; i++ ) {
+ final String prefixURI = xmlSource.getNamespaceURI(i);
String prefix = xmlSource.getNamespacePrefix(i);
- String prefixURI = xmlSource.getNamespaceURI(i);
if ( prefix == null )
prefix = "";
emitPrefix(prefix, prefixURI);
@@ -1452,18 +1466,11 @@ class ParserRRX_StAX_SR {
private Node iriResolve(String uriStr, Location location) {
Objects.requireNonNull(uriStr);
Objects.requireNonNull(location);
- String resolved = resolveIRI(uriStr, location);
- int line = location.getLineNumber();
- int col = location.getColumnNumber();
- return parserProfile.createURI(resolved, line, col);
- }
-
- /** Resolve an IRI. */
- private String resolveIRI(String uriStr, Location location) {
- if ( uriStr.startsWith("_:") )
- // <_:label> syntax. Handled by the FactoryRDF via the parser
profile.
- return uriStr;
- return resolveIRIx(uriStr, location).str();
+ final int line = location.getLineNumber();
+ final int col = location.getColumnNumber();
+ return uriStr.startsWith("_:")
+ ? parserProfile.createURI(uriStr, line, col) // <_:label>
syntax. Handled by the FactoryRDF via the parser profile.
+ : parserProfile.createURI(resolveIRIx(uriStr, location),
line, col);
}
private IRIx resolveIRIx(String uriStr, Location location) {
@@ -1480,10 +1487,13 @@ class ParserRRX_StAX_SR {
private IRIx resolveIRIxAny(String uriStr, Location location) {
try {
- IRIx iri = ( currentBase != null )
- ? currentBase.resolve(uriStr)
- : IRIx.create(uriStr);
- return iri;
+ return currentIriCache.get(uriStr, uri -> {
+ if( currentBase != null ) {
+ return currentBase.resolve(uri);
+ } else {
+ return IRIx.create(uriStr);
+ }
+ });
} catch (IRIException ex) {
throw RDFXMLparseError(ex.getMessage(), location);
}
@@ -1505,13 +1515,6 @@ class ParserRRX_StAX_SR {
return parserProfile.createBlankNode(null, label, line, col);
}
-// private Node literal(String lex, String datatype, String lang, Location
location) {
-// int line = location.getLineNumber();
-// int col = location.getColumnNumber();
-// return parserProfile.createL
-// }
- // literal(lex, datatype, lang)
-
private Node literal(String lexical, Location location) {
Objects.requireNonNull(lexical);
Objects.requireNonNull(location);
@@ -1741,5 +1744,4 @@ class ParserRRX_StAX_SR {
}
throw new RDFXMLParseException("Failed to find any non-whitespace
characters");
}
-
}
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
index de630ba467..fc454c9a12 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
@@ -22,6 +22,7 @@ import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.graph.Graph;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.Triple;
+import org.apache.jena.irix.IRIx;
import org.apache.jena.riot.tokens.Token;
import org.apache.jena.sparql.core.Quad;
@@ -49,9 +50,12 @@ public interface ParserProfile {
/** Create a quad */
public Quad createQuad(Node graph, Node subject, Node predicate, Node
object, long line, long col);
- /** Create a URI Node */
+ /** Create a URI Node, where 'uriStr' could also be a blank node. */
public Node createURI(String uriStr, long line, long col);
+ /** Create a URI Node from an IRIx; the given IRI is used as-is instead of being resolved again. */
+ public Node createURI(IRIx iriX, long line, long col);
+
/** Create a literal for a string+datatype */
public Node createTypedLiteral(String lexical, RDFDatatype datatype, long
line, long col);
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
index 48c6f3c82b..70b28feba1 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
@@ -199,6 +199,11 @@ public class ParserProfileStd implements ParserProfile {
return factory.createURI(x);
}
+ @Override
+ public Node createURI(IRIx iriX, long line, long col) {
+ return factory.createURI(iriX.str());
+ }
+
@Override
public Node createTypedLiteral(String lexical, RDFDatatype datatype, long
line, long col) {
if ( checking )
diff --git
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
index 891cc4df94..c531c1e713 100644
---
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
+++
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
@@ -22,6 +22,7 @@ import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.graph.Graph;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.Triple;
+import org.apache.jena.irix.IRIx;
import org.apache.jena.riot.tokens.Token;
import org.apache.jena.sparql.core.Quad;
@@ -62,6 +63,11 @@ public class ParserProfileWrapper implements ParserProfile
return get().createURI(uriStr, line, col);
}
+ @Override
+ public Node createURI(IRIx iriX, long line, long col) {
+ return get().createURI(iriX, line, col);
+ }
+
@Override
public Node createTypedLiteral(String lexical, RDFDatatype datatype, long
line, long col) {
return get().createTypedLiteral(lexical, datatype, line, col);
diff --git a/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
b/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
index 7cfdf874bb..e3c2b74e12 100644
--- a/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
+++ b/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
@@ -72,10 +72,19 @@ public class TestReadXML {
assertEquals("XMLInputFactory.SUPPORT_DTD",
Boolean.FALSE,
xf.getProperty(XMLInputFactory.SUPPORT_DTD));
- // Java19. Setting ACCESS_EXTERNAL_DTD to "" now returns "" whereas it
was returning null.
- Object obj = xf.getProperty(XMLConstants.ACCESS_EXTERNAL_DTD);
- boolean noAccessExternalDTD = ( (obj == null) || ((obj instanceof
String) && ((String)obj).isEmpty()) );
- assertTrue("XMLConstants.ACCESS_EXTERNAL_DTD", noAccessExternalDTD);
+
+ String name = xf.getClass().getName();
+ boolean isWoodstox = name.startsWith("com.ctc.wstx.stax.");
+ boolean isAalto = name.startsWith("com.fasterxml.aalto.");
+ if(!isWoodstox && !isAalto) {
+ // Not supported by Woodstox or Aalto.
IS_SUPPORTING_EXTERNAL_ENTITIES = false is enough.
+ // Disable external DTDs (files and HTTP) - errors unless
SUPPORT_DTD is false.
+
+ // Java19. Setting ACCESS_EXTERNAL_DTD to "" now returns ""
whereas it was returning null.
+ Object obj = xf.getProperty(XMLConstants.ACCESS_EXTERNAL_DTD);
+ boolean noAccessExternalDTD = ( (obj == null) || ((obj instanceof
String) && ((String)obj).isEmpty()) );
+ assertTrue("XMLConstants.ACCESS_EXTERNAL_DTD",
noAccessExternalDTD);
+ }
assertEquals("javax.xml.stream.isSupportingExternalEntities",
Boolean.FALSE,xf.getProperty("javax.xml.stream.isSupportingExternalEntities"));
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java
new file mode 100644
index 0000000000..e3a6c439c7
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.riot.lang.rdfxml;
+
+import org.apache.commons.io.input.BufferedFileChannelInputStream;
+import org.apache.jena.graph.Graph;
+import org.apache.jena.mem.graph.helper.JMHDefaultOptions;
+import org.apache.jena.mem2.GraphMem2Fast;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFParser;
+import org.junit.Assert;
+import org.junit.Test;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.runner.Runner;
+
+import java.nio.file.StandardOpenOption;
+
+@State(Scope.Benchmark)
+public class TestXMLParser {
+
+ @Param({
+ "../testing/pizza.owl.rdf",
+// "../testing/citations.rdf",
+// "../testing/BSBM/bsbm-5m.xml",
+
+ })
+ public String param0_GraphUri;
+
+ @Param({
+ "RRX.RDFXML_SAX",
+ "RRX.RDFXML_StAX_ev",
+ "RRX.RDFXML_StAX_sr",
+
+// "RRX.RDFXML_ARP0",
+ "RRX.RDFXML_ARP1"
+ })
+ public String param1_ParserLang;
+
+
+ private static Lang getLang(String langName) {
+ switch (langName) {
+ case "RRX.RDFXML_SAX":
+ return RRX.RDFXML_SAX;
+ case "RRX.RDFXML_StAX_ev":
+ return RRX.RDFXML_StAX_ev;
+ case "RRX.RDFXML_StAX_sr":
+ return RRX.RDFXML_StAX_sr;
+
+ case "RRX.RDFXML_ARP0":
+ return RRX.RDFXML_ARP0;
+ case "RRX.RDFXML_ARP1":
+ return RRX.RDFXML_ARP1;
+
+ default:
+ throw new IllegalArgumentException("Unknown lang: " +
langName);
+ }
+ }
+
+ private static org.apache.shadedJena510.riot.Lang getLangJena510(String
langName) {
+ switch (langName) {
+ case "RRX.RDFXML_SAX":
+ return
org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_SAX;
+ case "RRX.RDFXML_StAX_ev":
+ return
org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_StAX_ev;
+ case "RRX.RDFXML_StAX_sr":
+ return
org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_StAX_sr;
+
+ case "RRX.RDFXML_ARP0":
+ return
org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_ARP0;
+ case "RRX.RDFXML_ARP1":
+ return
org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_ARP1;
+
+ default:
+ throw new IllegalArgumentException("Unknown lang: " +
langName);
+ }
+ }
+
+ @Benchmark
+ public Graph parseXML() throws Exception {
+ final var graph = new GraphMem2Fast();
+ try(final var is = new BufferedFileChannelInputStream.Builder()
+ .setFile(this.param0_GraphUri)
+ .setOpenOptions(StandardOpenOption.READ)
+ .setBufferSize(64*4096)
+ .get()) {
+ RDFParser.source(is)
+ .base("xx:")
+ .forceLang(getLang(this.param1_ParserLang))
+ .checking(false)
+ .parse(graph);
+ }
+ return graph;
+ }
+
+ @Benchmark
+ public org.apache.shadedJena510.graph.Graph parseXMLJena510() throws
Exception {
+ final var graph = new org.apache.shadedJena510.mem2.GraphMem2Fast();
+ try(final var is = new BufferedFileChannelInputStream.Builder()
+ .setFile(this.param0_GraphUri)
+ .setOpenOptions(StandardOpenOption.READ)
+ .setBufferSize(64*4096)
+ .get()) {
+ org.apache.shadedJena510.riot.RDFParser.source(is)
+ .base("xx:")
+ .forceLang(getLangJena510(this.param1_ParserLang))
+ .checking(false)
+ .parse(graph);
+ }
+ return graph;
+ }
+
+ @Setup(Level.Trial)
+ public void setup() {
+ org.apache.shadedJena510.riot.lang.rdfxml.RRX.register();
+ }
+
+ @Test
+ public void benchmark() throws Exception {
+ var opt = JMHDefaultOptions.getDefaults(this.getClass())
+ .warmupIterations(2)
+ .measurementIterations(4)
+ .build();
+ var results = new Runner(opt).run();
+ Assert.assertNotNull(results);
+ }
+
+}
diff --git a/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
index 498f86d212..0283be1765 100644
--- a/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
+++ b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
@@ -134,6 +134,7 @@ public class JenaXMLInput {
String name = xmlInputFactory.getClass().getName();
boolean isWoodstox = name.startsWith("com.ctc.wstx.stax.");
+ boolean isAalto = name.startsWith("com.fasterxml.aalto.");
boolean isJDK = name.contains("sun.xml.internal");
boolean isXerces = name.startsWith("org.apache.xerces");
@@ -146,9 +147,9 @@ public class JenaXMLInput {
// disable external entities (silently ignore)
setXMLInputFactoryProperty(xmlInputFactory,
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
- // Not supported by Woodstox. IS_SUPPORTING_EXTERNAL_ENTITIES = false
is enough.
+ // Not supported by Woodstox or Aalto. IS_SUPPORTING_EXTERNAL_ENTITIES
= false is enough.
// Disable external DTDs (files and HTTP) - errors unless SUPPORT_DTD
is false.
- if ( ! isWoodstox )
+ if ( ! isWoodstox && ! isAalto)
setXMLInputFactoryProperty(xmlInputFactory,
XMLConstants.ACCESS_EXTERNAL_DTD, "");
return xmlInputFactory;