This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git


The following commit(s) were added to refs/heads/main by this push:
     new 2c9782c6d7 GH-2740: Faster parsing of RDF/XML
2c9782c6d7 is described below

commit 2c9782c6d7647cf29124ca281887f81a83fb7e24
Author: arne-bdt <[email protected]>
AuthorDate: Sat Sep 28 22:14:56 2024 +0200

    GH-2740: Faster parsing of RDF/XML
    
    Parsers: RRX.RDFXML_SAX, RRX.RDFXML_StAX_ev, RRX.RDFXML_StAX_sr
    
    - added "public Node createURI(IRIx iriX, ...);" to the ParserProfile, 
which simply uses the given IRI instead of resolving it again.
    - added general IRIx caching (org.apache.jena.atlas.lib.cache.CacheSimple) 
in the parsers
      where the already cached 
org.apache.jena.riot.system.ParserProfileStd#resolver is not applicable
    - removed unused code and variables from ParserRRX_StAX_SR and 
ParserRRX_StAX_EV
    - added `org.apache.jena.riot.lang.rdfxml.TestXMLParser` in 
jena-benchmarks-jmh
---
 .../org/apache/jena/riot/lang/rdfxml/SysRRX.java   |  15 ++-
 .../jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java   |  78 ++++++++---
 .../lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java | 115 +++++++++--------
 .../lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java | 118 ++++++++---------
 .../org/apache/jena/riot/system/ParserProfile.java |   6 +-
 .../apache/jena/riot/system/ParserProfileStd.java  |   5 +
 .../jena/riot/system/ParserProfileWrapper.java     |   6 +
 .../java/org/apache/jena/system/TestReadXML.java   |  17 ++-
 .../jena/riot/lang/rdfxml/TestXMLParser.java       | 143 +++++++++++++++++++++
 .../java/org/apache/jena/util/JenaXMLInput.java    |   5 +-
 10 files changed, 370 insertions(+), 138 deletions(-)

diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
index 17a6173b26..90deea6e18 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/SysRRX.java
@@ -29,8 +29,21 @@ import org.apache.jena.util.JenaXMLInput;
  */
 public class SysRRX {
 
+    /**
+     * Creates and initializes a 
javax.xml.stream.XMLInputFactory#newInstance().
+     * @return XMLInputFactory
+     */
     public static XMLInputFactory createXMLInputFactory() {
-        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
+        return initAndConfigure(XMLInputFactory.newInstance());
+    }
+
+    /**
+     * Configures the parser to be safe and sets necessary properties.
+     * This method should be called when a factory other than
+     * javax.xml.stream.XMLInputFactory#newInstance() is used.
+     * @param xmlInputFactory
+     */
+    public static <E extends XMLInputFactory> E initAndConfigure(final E 
xmlInputFactory) {
         JenaXMLInput.initXMLInputFactory(xmlInputFactory);
         // Additional features. Enable character entity support.
         xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.TRUE);
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
index d6aa5c68e1..30daf48700 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx/ParserRRX_SAX.java
@@ -29,6 +29,8 @@ import javax.xml.namespace.QName;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
 import org.apache.jena.atlas.lib.EscapeStr;
 import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -61,6 +63,7 @@ class ParserRRX_SAX
             LexicalHandler,
             DeclHandler,
             EntityResolver2 {
+    private static int IRI_CACHE_SIZE = 8192;
     private static boolean VERBOSE = false;
     // Addition tracing for SAX events we don't care about.
     private static boolean EVENTS = false;
@@ -299,8 +302,8 @@ class ParserRRX_SAX
                                Counter containerPropertyCounter,
                                NodeHolder collectionNode,
                                Emitter emitter,
-                               ParserMode parserMode
-                               ) {}
+                               ParserMode parserMode,
+                               Cache<String, IRIx> iriCache) {}
 
     private Deque<ParserFrame> parserStack = new ArrayDeque<>();
 
@@ -320,7 +323,8 @@ class ParserRRX_SAX
                                             containerPropertyCounter,
                                             collectionNode,
                                             currentEmitter,
-                                            frameParserMode);
+                                            frameParserMode,
+                                            currentIriCache);
         parserStack.push(frame);
     }
 
@@ -331,8 +335,10 @@ class ParserRRX_SAX
             trace.printf("Pop frame: S: %s -> %s : P: %s -> %s\n", 
str(currentSubject), frame.subject,
                          str(currentProperty), frame.property);
         }
-
-        this.currentBase = frame.base;
+        if(isDifferentFromCurrentBase(frame.base)) {
+            this.currentBase = frame.base;
+            this.currentIriCache = frame.iriCache;
+        }
         this.currentLang = frame.lang;
         this.currentSubject = frame.subject;
         this.currentProperty = frame.property;
@@ -382,6 +388,32 @@ class ParserRRX_SAX
     private final String initialXmlBase;
     private final String initialXmlLang;
     private final StreamRDF destination;
+    private Cache<String, IRIx> iriCacheForBaseNull = null;
+    private Cache<String, IRIx> currentIriCache = null;
+    private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new 
HashMap<>();
+
+    private void updateCurrentIriCacheForCurrentBase() {
+        if(currentBase != null) {
+            currentIriCache = mapBaseIriToCache
+                    .computeIfAbsent(currentBase,
+                            b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+                    );
+        } else {
+            if(iriCacheForBaseNull == null) {
+                iriCacheForBaseNull = 
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+            }
+            currentIriCache = iriCacheForBaseNull;
+        }
+    }
+
+    private boolean isDifferentFromCurrentBase(IRIx base) {
+        if(currentBase != null) {
+            return !currentBase.equals(base);
+        } else if(base == null) {
+            return false;
+        }
+        return true;
+    }
 
     // Tracking for ID on nodes (not reification usage)
     // We limit the number of local fragment IDs tracked because map only 
grows.
@@ -476,6 +508,7 @@ class ParserRRX_SAX
         } else {
             this.currentBase = null;
         }
+        updateCurrentIriCacheForCurrentBase();
         this.currentLang = "";
         this.destination = destination;
     }
@@ -655,7 +688,11 @@ class ParserRRX_SAX
         String xmlBaseURI = attributes.getValue(xmlNS, xmlBaseLN);
         if ( xmlBaseURI != null ) {
             emitBase(xmlBaseURI, position);
-            currentBase = resolveIRIx(xmlBaseURI, position);
+            var newBase = resolveIRIx(xmlBaseURI, position);
+            if(!newBase.equals(currentBase)) {
+                currentBase = newBase;
+                updateCurrentIriCacheForCurrentBase();
+            }
         }
 
         for ( int i = 0 ; i < attributes.getLength() ; i++ ) {
@@ -962,8 +999,9 @@ class ParserRRX_SAX
             if ( xmlLang != null )
                 trace.printf("+ LANG @%s\n", xmlLang);
         }
-        if ( xmlBase != null ) {
+        if ( xmlBase != null && !xmlBase.equals(currentBase)) {
             currentBase = xmlBase;// resolve.
+            updateCurrentIriCacheForCurrentBase();
         }
 
         if ( xmlLang != null )
@@ -1364,11 +1402,9 @@ class ParserRRX_SAX
     private Node iriResolve(String uriStr, Position position) {
         Objects.requireNonNull(uriStr);
         Objects.requireNonNull(position);
-        if ( uriStr.startsWith("_:") )
-            // <_:label> syntax. Handled by the FactoryRDF via the parser 
profile.
-            return createURI(uriStr, position);
-        String resolved =  resolveIRIx(uriStr, position).str();
-        return createURI(resolved, position);
+        return uriStr.startsWith("_:")
+                ?  createURI(uriStr, position) // <_:label> syntax. Handled by 
the FactoryRDF via the parser profile.
+                :  createURI(resolveIRIx(uriStr, position), position);
     }
 
     private IRIx resolveIRIx(String uriStr, Position position) {
@@ -1386,10 +1422,13 @@ class ParserRRX_SAX
     /** String to IRIx, no opinion */
     private IRIx resolveIRIxAny(String uriStr, Position position) {
         try {
-            IRIx iri = ( currentBase != null )
-                    ? currentBase.resolve(uriStr)
-                    : IRIx.create(uriStr);
-            return iri;
+            return currentIriCache.get(uriStr, uri -> {
+                if( currentBase != null ) {
+                    return currentBase.resolve(uri);
+                } else {
+                    return IRIx.create(uriStr);
+                }
+            });
         } catch (IRIException ex) {
             throw RDFXMLparseError(ex.getMessage(), position);
         }
@@ -1403,6 +1442,13 @@ class ParserRRX_SAX
         return parserProfile.createURI(iriStr, line, col);
     }
 
+    private Node createURI(IRIx iriX, Position position) {
+        int line = position.line();
+        int col = position.column();
+        // Checking
+        return parserProfile.createURI(iriX, line, col);
+    }
+
     private Node blankNode(Position position) {
         Objects.requireNonNull(position);
         int line = position.line();
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
index eaefd06559..e30fe31b77 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_ev/ParserRRX_StAX_EV.java
@@ -31,6 +31,8 @@ import javax.xml.stream.events.*;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
 import org.apache.jena.atlas.lib.EscapeStr;
 import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -51,44 +53,70 @@ import org.apache.jena.vocabulary.RDF;
 
 /** StAX events */
 class ParserRRX_StAX_EV {
+    private static int IRI_CACHE_SIZE = 8192;
     private static boolean EVENTS = false;
     private final IndentedWriter trace;
 
     private final XMLEventReader xmlEventReader;
+
+    private Cache<String, IRIx> iriCacheForBaseNull = null;
+    private Cache<String, IRIx> currentIriCache = null;
+    private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new 
HashMap<>();
     // Stacks.
 
     // Constants
-    private static final String XML_PREFIX = "xml";
     private static final String rdfNS = RDF.uri;
     private static final String xmlNS = "http://www.w3.org/XML/1998/namespace";;
-    private static final String ID = "ID";
-    private static final String NODE_ID = "nodeID";
-    private static final String ABOUT = "about";
-    private int blankNodeCounter  = 0 ;
     private boolean hasRDF = false;
 
     private final ParserProfile parserProfile;
     private final ErrorHandler errorHandler;
-    private final Context context;
-    private final String initialXmlBase;
-    private final String initialXmlLang;
     private final StreamRDF destination;
 
-    private record BaseLang(IRIx base, String lang) {}
+    private void updateCurrentIriCacheForCurrentBase() {
+        if(currentBase != null) {
+            currentIriCache = mapBaseIriToCache
+                    .computeIfAbsent(currentBase,
+                            b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+                    );
+        } else {
+            if(iriCacheForBaseNull == null) {
+                iriCacheForBaseNull = 
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+            }
+            currentIriCache = iriCacheForBaseNull;
+        }
+    }
+
+    private boolean isDifferentFromCurrentBase(IRIx base) {
+        if(currentBase != null) {
+            return !currentBase.equals(base);
+        } else if(base == null) {
+            return false;
+        }
+        return true;
+    }
+
+    private record BaseLang(IRIx base, String lang, Cache<String, IRIx> 
iriCache) {}
     private Deque<BaseLang> stack = new ArrayDeque<>();
     // Just these operations:
 
     private void pushFrame(IRIx base, String lang) {
-        BaseLang frame = new BaseLang(currentBase, currentLang);
+        BaseLang frame = new BaseLang(currentBase, currentLang, 
currentIriCache);
         stack.push(frame);
-        currentBase = base;
         currentLang = lang;
+        if(isDifferentFromCurrentBase(base)) {
+            currentBase = base;
+            updateCurrentIriCacheForCurrentBase();
+        }
     }
 
     private void popFrame() {
         BaseLang frame = stack.pop();
-        currentBase = frame.base;
         currentLang = frame.lang;
+        if(isDifferentFromCurrentBase(frame.base)) {
+            currentBase = frame.base;
+            currentIriCache = frame.iriCache;
+        }
     }
 
     /** Mark the usage of a QName */
@@ -165,16 +193,14 @@ class ParserRRX_StAX_EV {
 
         this.xmlEventReader = reader;
         this.parserProfile = parserProfile;
-        this.context = context;
         this.errorHandler = parserProfile.getErrorHandler();
-        this.initialXmlBase = xmlBase;
-        this.initialXmlLang = "";
         if ( xmlBase != null ) {
             this.currentBase = IRIx.create(xmlBase);
-            //parserProfile.setBaseIRI(currentBase.str());
+            parserProfile.setBaseIRI(currentBase.str());
         } else {
             this.currentBase = null;
         }
+        updateCurrentIriCacheForCurrentBase();
         this.currentLang = "";
         this.destination = destination;
     }
@@ -186,10 +212,6 @@ class ParserRRX_StAX_EV {
     private static final QName rdfAbout = new QName(rdfNS, "about");
     private static final QName rdfType = new QName(rdfNS, "type");
 
-    private static final QName rdfSeq = new QName(rdfNS, "Seq");
-    private static final QName rdfBag = new QName(rdfNS, "Bag");
-    private static final QName rdfAlt = new QName(rdfNS, "Alt");
-
     private static final QName rdfContainerItem = new QName(rdfNS, "li");
     private static final QName rdfDatatype = new QName(rdfNS, "datatype");
     private static final QName rdfParseType = new QName(rdfNS, "parseType");
@@ -985,8 +1007,9 @@ class ParserRRX_StAX_EV {
         }
 
         // Not seen this prefix or it was a different value.
-        if ( ! namespaces.containsKey(prefix) ||
-                ( namespaceURI != null && ! 
namespaces.get(prefix).equals(namespaceURI)) ) {
+        if ( namespaceURI != "" &&  // this first condition is needed for 
woodstox and aalto to work
+                (! namespaces.containsKey(prefix) ||
+                 ( namespaceURI != null && ! 
namespaces.get(prefix).equals(namespaceURI)) )) {
             // Define in current XML subtree.
             outputNS.put(prefix, namespaceURI);
             namespaces.put(prefix, namespaceURI);
@@ -1170,12 +1193,6 @@ class ParserRRX_StAX_EV {
 
     // ---- Nodes
 
-    private void setBase(String uriStr, Location location) {
-        // Resolves
-        Node n = iriResolve(uriStr, location);
-        parserProfile.setBaseIRI(n.getURI());
-    }
-
     private Node qNameToIRI(QName qName, QNameUsage usage, Location location) {
         if ( StringUtils.isBlank(qName.getNamespaceURI()) )
             throw RDFXMLparseError("Unqualified "+usage.msg+" not allowed: 
<"+qName.getLocalPart()+">", location);
@@ -1375,11 +1392,8 @@ class ParserRRX_StAX_EV {
         }
         boolean hasFrame = (xmlBase != null || xmlLang != null);
         if ( hasFrame ) {
-            pushFrame(currentBase, currentLang);
-            if ( xmlBase != null )
-                currentBase = xmlBase;
-            if ( xmlLang != null )
-                currentLang = xmlLang;
+            pushFrame(xmlBase != null ? xmlBase : currentBase,
+                    xmlLang != null ? xmlLang : currentLang);
         }
         return hasFrame;
     }
@@ -1475,18 +1489,11 @@ class ParserRRX_StAX_EV {
     private Node iriResolve(String uriStr, Location location) {
         Objects.requireNonNull(uriStr);
         Objects.requireNonNull(location);
-        int line = location.getLineNumber();
-        int col = location.getColumnNumber();
-        String resolved = resolveIRI(uriStr, location);
-        return parserProfile.createURI(resolved, line, col);
-    }
-
-    /** Resolve an IRI. */
-    private String resolveIRI(String uriStr, Location location) {
-        if ( uriStr.startsWith("_:") )
-            // <_:label> syntax. Handled by the FactoryRDF via the parser 
profile.
-            return uriStr;
-        return resolveIRIx(uriStr, location).str();
+        final int line = location.getLineNumber();
+        final int col = location.getColumnNumber();
+        return uriStr.startsWith("_:")
+                ?  parserProfile.createURI(uriStr, line, col) // <_:label> 
syntax. Handled by the FactoryRDF via the parser profile.
+                :  parserProfile.createURI(resolveIRIx(uriStr, location), 
line, col);
     }
 
     private IRIx resolveIRIx(String uriStr, Location location) {
@@ -1503,10 +1510,13 @@ class ParserRRX_StAX_EV {
 
     private IRIx resolveIRIxNoWarning(String uriStr, Location location) {
         try {
-            IRIx iri = ( currentBase != null )
-                    ? currentBase.resolve(uriStr)
-                    : IRIx.create(uriStr);
-            return iri;
+            return currentIriCache.get(uriStr, uri -> {
+                if( currentBase != null ) {
+                    return currentBase.resolve(uri);
+                } else {
+                    return IRIx.create(uriStr);
+                }
+            });
         } catch (IRIException ex) {
             throw RDFXMLparseError(ex.getMessage(), location);
         }
@@ -1528,13 +1538,6 @@ class ParserRRX_StAX_EV {
         return parserProfile.createBlankNode(null, label, line, col);
     }
 
-//    private Node literal(String lex, String datatype, String lang, Location 
location) {
-//        int line = location.getLineNumber();
-//        int col = location.getColumnNumber();
-//        return parserProfile.createL
-//    }
-    // literal(lex, datatype, lang)
-
     private Node literal(String lexical, Location location) {
         Objects.requireNonNull(lexical);
         Objects.requireNonNull(location);
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
index 9f11b5954f..f021a8e1c4 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/rdfxml/rrx_stax_sr/ParserRRX_StAX_SR.java
@@ -33,6 +33,8 @@ import javax.xml.stream.events.XMLEvent;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.lib.Cache;
+import org.apache.jena.atlas.lib.CacheFactory;
 import org.apache.jena.atlas.lib.EscapeStr;
 import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.datatypes.xsd.impl.XMLLiteralType;
@@ -54,44 +56,70 @@ import org.apache.jena.vocabulary.RDF.Nodes;
 
 /* StAX - stream reader */
 class ParserRRX_StAX_SR {
+    private static int IRI_CACHE_SIZE = 8192;
     private static boolean EVENTS = false;
     private final IndentedWriter trace;
 
     private final XMLStreamReader xmlSource;
+
+    private Cache<String, IRIx> iriCacheForBaseNull = null;
+    private Cache<String, IRIx> currentIriCache = null;
+    private final Map<IRIx, Cache<String, IRIx>> mapBaseIriToCache = new 
HashMap<>();
     // Stacks.
 
     // Constants
-    private static final String XML_PREFIX = "xml";
     private static final String rdfNS = RDF.uri;
     private static final String xmlNS = "http://www.w3.org/XML/1998/namespace";;
-    private static final String ID = "ID";
-    private static final String NODE_ID = "nodeID";
-    private static final String ABOUT = "about";
-    private int blankNodeCounter  = 0 ;
     private boolean hasRDF = false;
 
     private final ParserProfile parserProfile;
     private final ErrorHandler errorHandler;
-    private final Context context;
-    private final String initialXmlBase;
-    private final String initialXmlLang;
     private final StreamRDF destination;
 
-    private record BaseLang(IRIx base, String lang) {}
+    private void updateCurrentIriCacheForCurrentBase() {
+        if(currentBase != null) {
+            currentIriCache = mapBaseIriToCache
+                    .computeIfAbsent(currentBase,
+                            b -> CacheFactory.createSimpleCache(IRI_CACHE_SIZE)
+                    );
+        } else {
+            if(iriCacheForBaseNull == null) {
+                iriCacheForBaseNull = 
CacheFactory.createSimpleCache(IRI_CACHE_SIZE);
+            }
+            currentIriCache = iriCacheForBaseNull;
+        }
+    }
+
+    private boolean isDifferentFromCurrentBase(IRIx base) {
+        if(currentBase != null) {
+            return !currentBase.equals(base);
+        } else if(base == null) {
+            return false;
+        }
+        return true;
+    }
+
+    private record BaseLang(IRIx base, String lang, Cache<String, IRIx> 
iriCache) {}
     private Deque<BaseLang> stack = new ArrayDeque<>();
     // Just these operations:
 
     private void pushFrame(IRIx base, String lang) {
-        BaseLang frame = new BaseLang(currentBase, currentLang);
+        BaseLang frame = new BaseLang(currentBase, currentLang, 
currentIriCache);
         stack.push(frame);
-        currentBase = base;
         currentLang = lang;
+        if(isDifferentFromCurrentBase(base)) {
+            currentBase = base;
+            updateCurrentIriCacheForCurrentBase();
+        }
     }
 
     private void popFrame() {
         BaseLang frame = stack.pop();
-        currentBase = frame.base;
         currentLang = frame.lang;
+        if(isDifferentFromCurrentBase(frame.base)) {
+            currentBase = frame.base;
+            currentIriCache = frame.iriCache;
+        }
     }
 
     /** Mark the usage of a QName */
@@ -126,6 +154,7 @@ class ParserRRX_StAX_SR {
         else
             errorHandler.warning(message, -1, -1);
     }
+
     // Tracking for ID on nodes (not reification usage)
     // We limit the number of local fragment IDs tracked because map only 
grows.
     // A base URI may be re-introduced so this isn't nested scoping.
@@ -165,16 +194,14 @@ class ParserRRX_StAX_SR {
 
         this.xmlSource = reader;
         this.parserProfile = parserProfile;
-        this.context = context;
         this.errorHandler = parserProfile.getErrorHandler();
-        this.initialXmlBase = xmlBase;
-        this.initialXmlLang = "";
         if ( xmlBase != null ) {
             this.currentBase = IRIx.create(xmlBase);
             parserProfile.setBaseIRI(currentBase.str());
         } else {
             this.currentBase = null;
         }
+        updateCurrentIriCacheForCurrentBase();
         this.currentLang = "";
         this.destination = destination;
     }
@@ -186,10 +213,6 @@ class ParserRRX_StAX_SR {
     private static final QName rdfAbout = new QName(rdfNS, "about");
     private static final QName rdfType = new QName(rdfNS, "type");
 
-    private static final QName rdfSeq = new QName(rdfNS, "Seq");
-    private static final QName rdfBag = new QName(rdfNS, "Bag");
-    private static final QName rdfAlt = new QName(rdfNS, "Alt");
-
     private static final QName rdfContainerItem = new QName(rdfNS, "li");
     private static final QName rdfDatatype = new QName(rdfNS, "datatype");
     private static final QName rdfParseType = new QName(rdfNS, "parseType");
@@ -239,7 +262,6 @@ class ParserRRX_StAX_SR {
         return $coreSyntaxTerms.contains(qName);
     }
 
-
     // 6.2.5 Production nodeElementURIs
     // anyURI - ( coreSyntaxTerms | rdf:li | oldTerms )
     private static boolean allowedNodeElementURIs(QName qName) {
@@ -952,8 +974,9 @@ class ParserRRX_StAX_SR {
         }
 
         // Not seen this prefix or it was a different value.
-        if ( ! namespaces.containsKey(prefix) ||
-                ( namespaceURI != null && ! 
namespaces.get(prefix).equals(namespaceURI)) ) {
+        if ( namespaceURI != "" &&      // this first condition is needed for 
woodstox and aalto to work
+                (! namespaces.containsKey(prefix) ||
+                 ( namespaceURI != null && ! 
namespaces.get(prefix).equals(namespaceURI)) )) {
             // Define in current XML subtree.
             outputNS.put(prefix, namespaceURI);
             namespaces.put(prefix, namespaceURI);
@@ -1126,12 +1149,6 @@ class ParserRRX_StAX_SR {
 
     // ---- Nodes
 
-    private void setBase(String uriStr, Location location) {
-        Node n = iriResolve(uriStr, location);
-        parserProfile.setBaseIRI(n.getURI());
-    }
-
-    /** This is the RDF rule for creating an IRI from a QName. */
     private Node qNameToIRI(QName qName, QNameUsage usage, Location location) {
         if ( StringUtils.isBlank(qName.getNamespaceURI()) )
             throw RDFXMLparseError("Unqualified "+usage.msg+" not allowed: 
<"+qName.getLocalPart()+">", location);
@@ -1331,11 +1348,8 @@ class ParserRRX_StAX_SR {
         }
         boolean hasFrame = (xmlBase != null || xmlLang != null);
         if ( hasFrame ) {
-            pushFrame(currentBase, currentLang);
-            if ( xmlBase != null )
-                currentBase = xmlBase;
-            if ( xmlLang != null )
-                currentLang = xmlLang;
+            pushFrame(xmlBase != null ? xmlBase : currentBase,
+                    xmlLang != null ? xmlLang : currentLang);
         }
         return hasFrame;
     }
@@ -1369,8 +1383,8 @@ class ParserRRX_StAX_SR {
             emitBase(xmlBase);
         int numNS = xmlSource.getNamespaceCount();
         for ( int i = 0 ; i < numNS ; i++ ) {
+            final String prefixURI = xmlSource.getNamespaceURI(i);
             String prefix = xmlSource.getNamespacePrefix(i);
-            String prefixURI = xmlSource.getNamespaceURI(i);
             if ( prefix == null )
                 prefix = "";
             emitPrefix(prefix, prefixURI);
@@ -1452,18 +1466,11 @@ class ParserRRX_StAX_SR {
     private Node iriResolve(String uriStr, Location location) {
         Objects.requireNonNull(uriStr);
         Objects.requireNonNull(location);
-        String resolved = resolveIRI(uriStr, location);
-        int line = location.getLineNumber();
-        int col = location.getColumnNumber();
-        return parserProfile.createURI(resolved, line, col);
-    }
-
-    /** Resolve an IRI. */
-    private String resolveIRI(String uriStr, Location location) {
-        if ( uriStr.startsWith("_:") )
-            // <_:label> syntax. Handled by the FactoryRDF via the parser 
profile.
-            return uriStr;
-        return resolveIRIx(uriStr, location).str();
+        final int line = location.getLineNumber();
+        final int col = location.getColumnNumber();
+        return uriStr.startsWith("_:")
+                ?  parserProfile.createURI(uriStr, line, col) // <_:label> 
syntax. Handled by the FactoryRDF via the parser profile.
+                :  parserProfile.createURI(resolveIRIx(uriStr, location), 
line, col);
     }
 
     private IRIx resolveIRIx(String uriStr, Location location) {
@@ -1480,10 +1487,13 @@ class ParserRRX_StAX_SR {
 
     private IRIx resolveIRIxAny(String uriStr, Location location) {
         try {
-            IRIx iri = ( currentBase != null )
-                    ? currentBase.resolve(uriStr)
-                    : IRIx.create(uriStr);
-            return iri;
+            return currentIriCache.get(uriStr, uri -> {
+                if( currentBase != null ) {
+                    return currentBase.resolve(uri);
+                } else {
+                    return IRIx.create(uriStr);
+                }
+            });
         } catch (IRIException ex) {
             throw RDFXMLparseError(ex.getMessage(), location);
         }
@@ -1505,13 +1515,6 @@ class ParserRRX_StAX_SR {
         return parserProfile.createBlankNode(null, label, line, col);
     }
 
-//    private Node literal(String lex, String datatype, String lang, Location 
location) {
-//        int line = location.getLineNumber();
-//        int col = location.getColumnNumber();
-//        return parserProfile.createL
-//    }
-    // literal(lex, datatype, lang)
-
     private Node literal(String lexical, Location location) {
         Objects.requireNonNull(lexical);
         Objects.requireNonNull(location);
@@ -1741,5 +1744,4 @@ class ParserRRX_StAX_SR {
         }
         throw new RDFXMLParseException("Failed to find any non-whitespace 
characters");
     }
-
 }
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
index de630ba467..fc454c9a12 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfile.java
@@ -22,6 +22,7 @@ import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.graph.Graph;
 import org.apache.jena.graph.Node;
 import org.apache.jena.graph.Triple;
+import org.apache.jena.irix.IRIx;
 import org.apache.jena.riot.tokens.Token;
 import org.apache.jena.sparql.core.Quad;
 
@@ -49,9 +50,12 @@ public interface ParserProfile {
     /** Create a quad */
     public Quad createQuad(Node graph, Node subject, Node predicate, Node 
object, long line, long col);
 
-    /** Create a URI Node */
+    /** Create a URI Node, where 'uriStr' could also be a blank node. */
     public Node createURI(String uriStr, long line, long col);
 
+    /** Create a URI Node */
+    public Node createURI(IRIx iriX, long line, long col);
+
     /** Create a literal for a string+datatype */
     public Node createTypedLiteral(String lexical, RDFDatatype datatype, long 
line, long col);
 
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
index 48c6f3c82b..70b28feba1 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java
@@ -199,6 +199,11 @@ public class ParserProfileStd implements ParserProfile {
         return factory.createURI(x);
     }
 
+    @Override
+    public Node createURI(IRIx iriX, long line, long col) {
+        return factory.createURI(iriX.str());
+    }
+
     @Override
     public Node createTypedLiteral(String lexical, RDFDatatype datatype, long 
line, long col) {
         if ( checking )
diff --git 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
index 891cc4df94..c531c1e713 100644
--- 
a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
+++ 
b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileWrapper.java
@@ -22,6 +22,7 @@ import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.graph.Graph;
 import org.apache.jena.graph.Node;
 import org.apache.jena.graph.Triple;
+import org.apache.jena.irix.IRIx;
 import org.apache.jena.riot.tokens.Token;
 import org.apache.jena.sparql.core.Quad;
 
@@ -62,6 +63,11 @@ public class ParserProfileWrapper implements ParserProfile
         return get().createURI(uriStr, line, col);
     }
 
+    @Override
+    public Node createURI(IRIx iriX, long line, long col) {
+        return get().createURI(iriX, line, col);
+    }
+
     @Override
     public Node createTypedLiteral(String lexical, RDFDatatype datatype, long line, long col) {
         return get().createTypedLiteral(lexical, datatype, line, col);
diff --git a/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java b/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
index 7cfdf874bb..e3c2b74e12 100644
--- a/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
+++ b/jena-arq/src/test/java/org/apache/jena/system/TestReadXML.java
@@ -72,10 +72,19 @@ public class TestReadXML {
         assertEquals("XMLInputFactory.SUPPORT_DTD",
                      Boolean.FALSE, xf.getProperty(XMLInputFactory.SUPPORT_DTD));
 
-        // Java19. Setting ACCESS_EXTERNAL_DTD to "" now returns "" whereas it was returning null.
-        Object obj = xf.getProperty(XMLConstants.ACCESS_EXTERNAL_DTD);
-        boolean noAccessExternalDTD = ( (obj == null) || ((obj instanceof String) && ((String)obj).isEmpty()) );
-        assertTrue("XMLConstants.ACCESS_EXTERNAL_DTD", noAccessExternalDTD);
+
+        String name = xf.getClass().getName();
+        boolean isWoodstox = name.startsWith("com.ctc.wstx.stax.");
+        boolean isAalto = name.startsWith("com.fasterxml.aalto.");
+        if(!isWoodstox && !isAalto) {
+            // Not supported by Woodstox or Aalto. IS_SUPPORTING_EXTERNAL_ENTITIES = false is enough.
+            // Disable external DTDs (files and HTTP) - errors unless SUPPORT_DTD is false.
+
+            // Java19. Setting ACCESS_EXTERNAL_DTD to "" now returns "" whereas it was returning null.
+            Object obj = xf.getProperty(XMLConstants.ACCESS_EXTERNAL_DTD);
+            boolean noAccessExternalDTD = ( (obj == null) || ((obj instanceof String) && ((String)obj).isEmpty()) );
+            assertTrue("XMLConstants.ACCESS_EXTERNAL_DTD", noAccessExternalDTD);
+        }
 
         assertEquals("javax.xml.stream.isSupportingExternalEntities",
                      Boolean.FALSE,xf.getProperty("javax.xml.stream.isSupportingExternalEntities"));
diff --git a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java
new file mode 100644
index 0000000000..e3a6c439c7
--- /dev/null
+++ b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/riot/lang/rdfxml/TestXMLParser.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.riot.lang.rdfxml;
+
+import org.apache.commons.io.input.BufferedFileChannelInputStream;
+import org.apache.jena.graph.Graph;
+import org.apache.jena.mem.graph.helper.JMHDefaultOptions;
+import org.apache.jena.mem2.GraphMem2Fast;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFParser;
+import org.junit.Assert;
+import org.junit.Test;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.runner.Runner;
+
+import java.nio.file.StandardOpenOption;
+
+@State(Scope.Benchmark)
+public class TestXMLParser {
+
+    @Param({
+            "../testing/pizza.owl.rdf",
+//            "../testing/citations.rdf",
+//            "../testing/BSBM/bsbm-5m.xml",
+
+    })
+    public String param0_GraphUri;
+
+    @Param({
+            "RRX.RDFXML_SAX",
+            "RRX.RDFXML_StAX_ev",
+            "RRX.RDFXML_StAX_sr",
+
+//            "RRX.RDFXML_ARP0",
+            "RRX.RDFXML_ARP1"
+    })
+    public String param1_ParserLang;
+
+
+    private static Lang getLang(String langName) {
+        switch (langName) {
+            case "RRX.RDFXML_SAX":
+                return RRX.RDFXML_SAX;
+            case "RRX.RDFXML_StAX_ev":
+                return RRX.RDFXML_StAX_ev;
+            case "RRX.RDFXML_StAX_sr":
+                return RRX.RDFXML_StAX_sr;
+
+            case "RRX.RDFXML_ARP0":
+                return RRX.RDFXML_ARP0;
+            case "RRX.RDFXML_ARP1":
+                return RRX.RDFXML_ARP1;
+
+            default:
+                throw new IllegalArgumentException("Unknown lang: " + langName);
+        }
+    }
+
+    private static org.apache.shadedJena510.riot.Lang getLangJena510(String langName) {
+        switch (langName) {
+            case "RRX.RDFXML_SAX":
+                return org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_SAX;
+            case "RRX.RDFXML_StAX_ev":
+                return org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_StAX_ev;
+            case "RRX.RDFXML_StAX_sr":
+                return org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_StAX_sr;
+
+            case "RRX.RDFXML_ARP0":
+                return org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_ARP0;
+            case "RRX.RDFXML_ARP1":
+                return org.apache.shadedJena510.riot.lang.rdfxml.RRX.RDFXML_ARP1;
+
+            default:
+                throw new IllegalArgumentException("Unknown lang: " + langName);
+        }
+    }
+
+    @Benchmark
+    public Graph parseXML() throws Exception {
+        final var graph = new GraphMem2Fast();
+        try(final var is = new BufferedFileChannelInputStream.Builder()
+                .setFile(this.param0_GraphUri)
+                .setOpenOptions(StandardOpenOption.READ)
+                .setBufferSize(64*4096)
+                .get()) {
+            RDFParser.source(is)
+                    .base("xx:")
+                    .forceLang(getLang(this.param1_ParserLang))
+                    .checking(false)
+                    .parse(graph);
+        }
+        return graph;
+    }
+
+    @Benchmark
+    public org.apache.shadedJena510.graph.Graph parseXMLJena510() throws Exception {
+        final var graph = new org.apache.shadedJena510.mem2.GraphMem2Fast();
+        try(final var is = new BufferedFileChannelInputStream.Builder()
+                .setFile(this.param0_GraphUri)
+                .setOpenOptions(StandardOpenOption.READ)
+                .setBufferSize(64*4096)
+                .get()) {
+            org.apache.shadedJena510.riot.RDFParser.source(is)
+                    .base("xx:")
+                    .forceLang(getLangJena510(this.param1_ParserLang))
+                    .checking(false)
+                    .parse(graph);
+        }
+        return graph;
+    }
+
+    @Setup(Level.Trial)
+    public void setup() {
+        org.apache.shadedJena510.riot.lang.rdfxml.RRX.register();
+    }
+
+    @Test
+    public void benchmark() throws Exception {
+        var opt = JMHDefaultOptions.getDefaults(this.getClass())
+                .warmupIterations(2)
+                .measurementIterations(4)
+                .build();
+        var results = new Runner(opt).run();
+        Assert.assertNotNull(results);
+    }
+
+}
diff --git a/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
index 498f86d212..0283be1765 100644
--- a/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
+++ b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
@@ -134,6 +134,7 @@ public class JenaXMLInput {
 
         String name = xmlInputFactory.getClass().getName();
         boolean isWoodstox = name.startsWith("com.ctc.wstx.stax.");
+        boolean isAalto = name.startsWith("com.fasterxml.aalto.");
         boolean isJDK = name.contains("sun.xml.internal");
         boolean isXerces = name.startsWith("org.apache.xerces");
 
@@ -146,9 +147,9 @@ public class JenaXMLInput {
         // disable external entities (silently ignore)
         setXMLInputFactoryProperty(xmlInputFactory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
 
-        // Not supported by Woodstox. IS_SUPPORTING_EXTERNAL_ENTITIES = false is enough.
+        // Not supported by Woodstox or Aalto. IS_SUPPORTING_EXTERNAL_ENTITIES = false is enough.
         // Disable external DTDs (files and HTTP) - errors unless SUPPORT_DTD is false.
-        if ( ! isWoodstox )
+        if ( ! isWoodstox && ! isAalto)
             setXMLInputFactoryProperty(xmlInputFactory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
 
         return xmlInputFactory;


Reply via email to